-rw-r--r--.bazelrc3
-rw-r--r--BUILD11
-rw-r--r--Dockerfile4
-rw-r--r--Makefile2
-rw-r--r--README.md3
-rw-r--r--WORKSPACE83
-rw-r--r--benchmarks/BUILD10
-rw-r--r--benchmarks/README.md162
-rw-r--r--benchmarks/defs.bzl18
-rw-r--r--benchmarks/examples/localhost.yaml2
-rw-r--r--benchmarks/harness/BUILD92
-rw-r--r--benchmarks/harness/__init__.py25
-rw-r--r--benchmarks/harness/benchmark_driver.py85
-rw-r--r--benchmarks/harness/container.py181
-rw-r--r--benchmarks/harness/machine.py226
-rw-r--r--benchmarks/harness/machine_mocks/BUILD9
-rw-r--r--benchmarks/harness/machine_mocks/__init__.py81
-rw-r--r--benchmarks/harness/machine_producers/BUILD80
-rw-r--r--benchmarks/harness/machine_producers/__init__.py13
-rw-r--r--benchmarks/harness/machine_producers/gcloud_mock_recorder.py97
-rw-r--r--benchmarks/harness/machine_producers/gcloud_producer.py250
-rw-r--r--benchmarks/harness/machine_producers/gcloud_producer_test.py48
-rw-r--r--benchmarks/harness/machine_producers/machine_producer.py51
-rw-r--r--benchmarks/harness/machine_producers/mock_producer.py52
-rw-r--r--benchmarks/harness/machine_producers/testdata/get_five.json211
-rw-r--r--benchmarks/harness/machine_producers/testdata/get_one.json145
-rw-r--r--benchmarks/harness/machine_producers/yaml_producer.py106
-rw-r--r--benchmarks/harness/ssh_connection.py111
-rw-r--r--benchmarks/harness/tunnel_dispatcher.py122
-rw-r--r--benchmarks/requirements.txt32
-rw-r--r--benchmarks/run.py19
-rw-r--r--benchmarks/runner/BUILD63
-rw-r--r--benchmarks/runner/__init__.py278
-rw-r--r--benchmarks/runner/commands.py84
-rw-r--r--benchmarks/runner/runner_test.py59
-rw-r--r--benchmarks/suites/BUILD130
-rw-r--r--benchmarks/suites/__init__.py119
-rw-r--r--benchmarks/suites/absl.py37
-rw-r--r--benchmarks/suites/density.py121
-rw-r--r--benchmarks/suites/fio.py165
-rw-r--r--benchmarks/suites/helpers.py57
-rw-r--r--benchmarks/suites/http.py138
-rw-r--r--benchmarks/suites/media.py42
-rw-r--r--benchmarks/suites/ml.py33
-rw-r--r--benchmarks/suites/network.py101
-rw-r--r--benchmarks/suites/redis.py46
-rw-r--r--benchmarks/suites/startup.py110
-rw-r--r--benchmarks/suites/sysbench.py119
-rw-r--r--benchmarks/suites/syscall.py37
-rw-r--r--benchmarks/tcp/BUILD41
-rw-r--r--benchmarks/tcp/README.md87
-rw-r--r--benchmarks/tcp/nsjoin.c47
-rwxr-xr-xbenchmarks/tcp/tcp_benchmark.sh388
-rw-r--r--benchmarks/tcp/tcp_proxy.go444
-rw-r--r--benchmarks/workloads/BUILD35
-rw-r--r--benchmarks/workloads/__init__.py14
-rw-r--r--benchmarks/workloads/ab/BUILD36
-rw-r--r--benchmarks/workloads/ab/Dockerfile15
-rw-r--r--benchmarks/workloads/ab/__init__.py88
-rw-r--r--benchmarks/workloads/ab/ab_test.py42
-rw-r--r--benchmarks/workloads/absl/BUILD36
-rw-r--r--benchmarks/workloads/absl/Dockerfile24
-rw-r--r--benchmarks/workloads/absl/__init__.py63
-rw-r--r--benchmarks/workloads/absl/absl_test.py31
-rw-r--r--benchmarks/workloads/curl/BUILD13
-rw-r--r--benchmarks/workloads/curl/Dockerfile14
-rw-r--r--benchmarks/workloads/ffmpeg/BUILD18
-rw-r--r--benchmarks/workloads/ffmpeg/Dockerfile10
-rw-r--r--benchmarks/workloads/ffmpeg/__init__.py20
-rw-r--r--benchmarks/workloads/fio/BUILD36
-rw-r--r--benchmarks/workloads/fio/Dockerfile23
-rw-r--r--benchmarks/workloads/fio/__init__.py369
-rw-r--r--benchmarks/workloads/fio/fio_test.py44
-rw-r--r--benchmarks/workloads/httpd/BUILD13
-rw-r--r--benchmarks/workloads/httpd/Dockerfile27
-rw-r--r--benchmarks/workloads/iperf/BUILD36
-rw-r--r--benchmarks/workloads/iperf/Dockerfile14
-rw-r--r--benchmarks/workloads/iperf/__init__.py40
-rw-r--r--benchmarks/workloads/iperf/iperf_test.py28
-rw-r--r--benchmarks/workloads/netcat/BUILD13
-rw-r--r--benchmarks/workloads/netcat/Dockerfile14
-rw-r--r--benchmarks/workloads/nginx/BUILD13
-rw-r--r--benchmarks/workloads/nginx/Dockerfile1
-rw-r--r--benchmarks/workloads/node/BUILD15
-rw-r--r--benchmarks/workloads/node/Dockerfile2
-rw-r--r--benchmarks/workloads/node/index.js28
-rw-r--r--benchmarks/workloads/node/package.json19
-rw-r--r--benchmarks/workloads/node_template/BUILD17
-rw-r--r--benchmarks/workloads/node_template/Dockerfile5
-rw-r--r--benchmarks/workloads/node_template/index.hbs8
-rw-r--r--benchmarks/workloads/node_template/index.js43
-rw-r--r--benchmarks/workloads/node_template/package-lock.json486
-rw-r--r--benchmarks/workloads/node_template/package.json19
-rw-r--r--benchmarks/workloads/redis/BUILD13
-rw-r--r--benchmarks/workloads/redis/Dockerfile1
-rw-r--r--benchmarks/workloads/redisbenchmark/BUILD36
-rw-r--r--benchmarks/workloads/redisbenchmark/Dockerfile4
-rw-r--r--benchmarks/workloads/redisbenchmark/__init__.py85
-rw-r--r--benchmarks/workloads/redisbenchmark/redisbenchmark_test.py51
-rw-r--r--benchmarks/workloads/ruby/BUILD28
-rw-r--r--benchmarks/workloads/ruby/Dockerfile28
-rw-r--r--benchmarks/workloads/ruby/Gemfile12
-rw-r--r--benchmarks/workloads/ruby/Gemfile.lock55
-rwxr-xr-xbenchmarks/workloads/ruby/config.ru2
-rwxr-xr-xbenchmarks/workloads/ruby/index.rb14
-rw-r--r--benchmarks/workloads/ruby_template/BUILD19
-rwxr-xr-xbenchmarks/workloads/ruby_template/Dockerfile38
-rwxr-xr-xbenchmarks/workloads/ruby_template/Gemfile5
-rw-r--r--benchmarks/workloads/ruby_template/Gemfile.lock26
-rwxr-xr-xbenchmarks/workloads/ruby_template/config.ru2
-rwxr-xr-xbenchmarks/workloads/ruby_template/index.erb8
-rwxr-xr-xbenchmarks/workloads/ruby_template/main.rb27
-rw-r--r--benchmarks/workloads/sleep/BUILD13
-rw-r--r--benchmarks/workloads/sleep/Dockerfile3
-rw-r--r--benchmarks/workloads/sysbench/BUILD36
-rw-r--r--benchmarks/workloads/sysbench/Dockerfile16
-rw-r--r--benchmarks/workloads/sysbench/__init__.py167
-rw-r--r--benchmarks/workloads/sysbench/sysbench_test.py34
-rw-r--r--benchmarks/workloads/syscall/BUILD37
-rw-r--r--benchmarks/workloads/syscall/Dockerfile6
-rw-r--r--benchmarks/workloads/syscall/__init__.py29
-rw-r--r--benchmarks/workloads/syscall/syscall.c55
-rw-r--r--benchmarks/workloads/syscall/syscall_test.py27
-rw-r--r--benchmarks/workloads/tensorflow/BUILD18
-rw-r--r--benchmarks/workloads/tensorflow/Dockerfile14
-rw-r--r--benchmarks/workloads/tensorflow/__init__.py20
-rw-r--r--benchmarks/workloads/true/BUILD14
-rw-r--r--benchmarks/workloads/true/Dockerfile3
-rw-r--r--go.mod34
-rw-r--r--go.sum2
-rw-r--r--kokoro/iptables_tests.cfg10
-rw-r--r--kokoro/issue_reviver.cfg15
-rw-r--r--kokoro/kythe/generate_xrefs.cfg28
-rw-r--r--kokoro/kythe/generate_xrefs.sh54
-rwxr-xr-xkokoro/ubuntu1604/10_core.sh4
-rwxr-xr-xkokoro/ubuntu1604/40_kokoro.sh2
-rw-r--r--kokoro/ubuntu1604/README.md34
-rw-r--r--pkg/abi/linux/BUILD2
-rw-r--r--pkg/abi/linux/fcntl.go41
-rw-r--r--pkg/abi/linux/file.go33
-rw-r--r--pkg/abi/linux/fs.go7
-rw-r--r--pkg/abi/linux/netfilter.go89
-rw-r--r--pkg/abi/linux/rseq.go130
-rw-r--r--pkg/abi/linux/socket.go9
-rw-r--r--pkg/abi/linux/time.go13
-rw-r--r--pkg/abi/linux/xattr.go (renamed from pkg/sentry/fsimpl/proc/filesystems.go)20
-rw-r--r--pkg/amutex/BUILD1
-rw-r--r--pkg/amutex/amutex_test.go3
-rw-r--r--pkg/atomicbitops/BUILD1
-rw-r--r--pkg/atomicbitops/atomic_bitops_test.go3
-rw-r--r--pkg/compressio/BUILD5
-rw-r--r--pkg/compressio/compressio.go2
-rw-r--r--pkg/control/server/BUILD1
-rw-r--r--pkg/control/server/server.go2
-rw-r--r--pkg/eventchannel/BUILD2
-rw-r--r--pkg/eventchannel/event.go2
-rw-r--r--pkg/eventchannel/event_test.go2
-rw-r--r--pkg/fdchannel/BUILD1
-rw-r--r--pkg/fdchannel/fdchannel_test.go3
-rw-r--r--pkg/fdnotifier/BUILD1
-rw-r--r--pkg/fdnotifier/fdnotifier.go2
-rw-r--r--pkg/flipcall/BUILD3
-rw-r--r--pkg/flipcall/flipcall_example_test.go3
-rw-r--r--pkg/flipcall/flipcall_test.go3
-rw-r--r--pkg/flipcall/flipcall_unsafe.go10
-rw-r--r--pkg/fspath/BUILD2
-rw-r--r--pkg/fspath/fspath.go24
-rw-r--r--pkg/fspath/fspath_test.go25
-rw-r--r--pkg/gate/BUILD1
-rw-r--r--pkg/gate/gate_test.go2
-rw-r--r--pkg/goid/BUILD26
-rw-r--r--pkg/goid/empty_test.go22
-rw-r--r--pkg/goid/goid.go24
-rw-r--r--pkg/goid/goid_amd64.s21
-rw-r--r--pkg/goid/goid_race.go25
-rw-r--r--pkg/goid/goid_test.go74
-rw-r--r--pkg/goid/goid_unsafe.go64
-rw-r--r--pkg/linewriter/BUILD1
-rw-r--r--pkg/linewriter/linewriter.go3
-rw-r--r--pkg/log/BUILD5
-rw-r--r--pkg/log/log.go2
-rw-r--r--pkg/metric/BUILD1
-rw-r--r--pkg/metric/metric.go2
-rw-r--r--pkg/p9/BUILD1
-rw-r--r--pkg/p9/client.go2
-rw-r--r--pkg/p9/handlers.go36
-rw-r--r--pkg/p9/p9test/BUILD2
-rw-r--r--pkg/p9/p9test/client_test.go2
-rw-r--r--pkg/p9/p9test/p9test.go2
-rw-r--r--pkg/p9/path_tree.go3
-rw-r--r--pkg/p9/pool.go2
-rw-r--r--pkg/p9/server.go2
-rw-r--r--pkg/p9/transport.go2
-rw-r--r--pkg/procid/BUILD2
-rw-r--r--pkg/procid/procid_test.go3
-rw-r--r--pkg/rand/BUILD5
-rw-r--r--pkg/rand/rand_linux.go2
-rw-r--r--pkg/refs/BUILD2
-rw-r--r--pkg/refs/refcounter.go2
-rw-r--r--pkg/refs/refcounter_test.go3
-rw-r--r--pkg/sentry/arch/BUILD7
-rw-r--r--pkg/sentry/arch/arch.go6
-rw-r--r--pkg/sentry/arch/arch_aarch64.go293
-rw-r--r--pkg/sentry/arch/arch_amd64.go8
-rw-r--r--pkg/sentry/arch/arch_arm64.go266
-rw-r--r--pkg/sentry/arch/arch_state_aarch64.go38
-rw-r--r--pkg/sentry/arch/arch_state_x86.go2
-rw-r--r--pkg/sentry/arch/arch_x86.go2
-rw-r--r--pkg/sentry/arch/registers.proto37
-rw-r--r--pkg/sentry/arch/signal.go250
-rw-r--r--pkg/sentry/arch/signal_amd64.go230
-rw-r--r--pkg/sentry/arch/signal_arm64.go126
-rw-r--r--pkg/sentry/arch/signal_stack.go2
-rw-r--r--pkg/sentry/arch/syscalls_arm64.go62
-rw-r--r--pkg/sentry/control/BUILD1
-rw-r--r--pkg/sentry/control/pprof.go17
-rw-r--r--pkg/sentry/control/proc.go48
-rw-r--r--pkg/sentry/control/proc_test.go10
-rw-r--r--pkg/sentry/device/BUILD5
-rw-r--r--pkg/sentry/device/device.go2
-rw-r--r--pkg/sentry/fs/BUILD3
-rw-r--r--pkg/sentry/fs/copy_up.go2
-rw-r--r--pkg/sentry/fs/copy_up_test.go2
-rw-r--r--pkg/sentry/fs/dirent.go2
-rw-r--r--pkg/sentry/fs/dirent_cache.go3
-rw-r--r--pkg/sentry/fs/dirent_cache_limiter.go3
-rw-r--r--pkg/sentry/fs/fdpipe/BUILD1
-rw-r--r--pkg/sentry/fs/fdpipe/pipe.go2
-rw-r--r--pkg/sentry/fs/fdpipe/pipe_state.go2
-rw-r--r--pkg/sentry/fs/file.go9
-rw-r--r--pkg/sentry/fs/file_overlay.go2
-rw-r--r--pkg/sentry/fs/filesystems.go2
-rw-r--r--pkg/sentry/fs/fs.go3
-rw-r--r--pkg/sentry/fs/fsutil/BUILD3
-rw-r--r--pkg/sentry/fs/fsutil/file_range_set.go14
-rw-r--r--pkg/sentry/fs/fsutil/host_file_mapper.go2
-rw-r--r--pkg/sentry/fs/fsutil/host_mappable.go2
-rw-r--r--pkg/sentry/fs/fsutil/inode.go3
-rw-r--r--pkg/sentry/fs/fsutil/inode_cached.go2
-rw-r--r--pkg/sentry/fs/gofer/BUILD1
-rw-r--r--pkg/sentry/fs/gofer/inode.go2
-rw-r--r--pkg/sentry/fs/gofer/path.go4
-rw-r--r--pkg/sentry/fs/gofer/session.go8
-rw-r--r--pkg/sentry/fs/host/BUILD1
-rw-r--r--pkg/sentry/fs/host/inode.go2
-rw-r--r--pkg/sentry/fs/host/socket.go2
-rw-r--r--pkg/sentry/fs/host/tty.go3
-rw-r--r--pkg/sentry/fs/inode.go11
-rw-r--r--pkg/sentry/fs/inode_inotify.go3
-rw-r--r--pkg/sentry/fs/inode_overlay.go7
-rw-r--r--pkg/sentry/fs/inotify.go2
-rw-r--r--pkg/sentry/fs/inotify_watch.go2
-rw-r--r--pkg/sentry/fs/lock/BUILD1
-rw-r--r--pkg/sentry/fs/lock/lock.go2
-rw-r--r--pkg/sentry/fs/mounts.go2
-rw-r--r--pkg/sentry/fs/overlay.go5
-rw-r--r--pkg/sentry/fs/proc/BUILD1
-rw-r--r--pkg/sentry/fs/proc/seqfile/BUILD1
-rw-r--r--pkg/sentry/fs/proc/seqfile/seqfile.go2
-rw-r--r--pkg/sentry/fs/proc/sys_net.go19
-rw-r--r--pkg/sentry/fs/proc/task.go38
-rw-r--r--pkg/sentry/fs/ramfs/BUILD1
-rw-r--r--pkg/sentry/fs/ramfs/dir.go2
-rw-r--r--pkg/sentry/fs/restore.go2
-rw-r--r--pkg/sentry/fs/splice.go5
-rw-r--r--pkg/sentry/fs/tmpfs/BUILD1
-rw-r--r--pkg/sentry/fs/tmpfs/inode_file.go2
-rw-r--r--pkg/sentry/fs/tty/BUILD1
-rw-r--r--pkg/sentry/fs/tty/dir.go2
-rw-r--r--pkg/sentry/fs/tty/line_discipline.go2
-rw-r--r--pkg/sentry/fs/tty/master.go6
-rw-r--r--pkg/sentry/fs/tty/queue.go3
-rw-r--r--pkg/sentry/fs/tty/slave.go6
-rw-r--r--pkg/sentry/fs/tty/terminal.go4
-rw-r--r--pkg/sentry/fsimpl/ext/BUILD3
-rw-r--r--pkg/sentry/fsimpl/ext/benchmark/BUILD1
-rw-r--r--pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go21
-rw-r--r--pkg/sentry/fsimpl/ext/directory.go3
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/extent.go10
-rw-r--r--pkg/sentry/fsimpl/ext/disklayout/extent_test.go6
-rw-r--r--pkg/sentry/fsimpl/ext/ext_test.go42
-rw-r--r--pkg/sentry/fsimpl/ext/extent_file.go8
-rw-r--r--pkg/sentry/fsimpl/ext/file_description.go19
-rw-r--r--pkg/sentry/fsimpl/ext/filesystem.go58
-rw-r--r--pkg/sentry/fsimpl/ext/inode.go15
-rw-r--r--pkg/sentry/fsimpl/ext/regular_file.go2
-rw-r--r--pkg/sentry/fsimpl/kernfs/BUILD64
-rw-r--r--pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go127
-rw-r--r--pkg/sentry/fsimpl/kernfs/fd_impl_util.go194
-rw-r--r--pkg/sentry/fsimpl/kernfs/filesystem.go793
-rw-r--r--pkg/sentry/fsimpl/kernfs/inode_impl_util.go512
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs.go422
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs_test.go426
-rw-r--r--pkg/sentry/fsimpl/kernfs/symlink.go45
-rw-r--r--pkg/sentry/fsimpl/memfs/filesystem.go584
-rw-r--r--pkg/sentry/fsimpl/memfs/regular_file.go154
-rw-r--r--pkg/sentry/fsimpl/proc/BUILD33
-rw-r--r--pkg/sentry/fsimpl/proc/boot_test.go149
-rw-r--r--pkg/sentry/fsimpl/proc/filesystem.go69
-rw-r--r--pkg/sentry/fsimpl/proc/loadavg.go8
-rw-r--r--pkg/sentry/fsimpl/proc/meminfo.go6
-rw-r--r--pkg/sentry/fsimpl/proc/mounts.go2
-rw-r--r--pkg/sentry/fsimpl/proc/stat.go6
-rw-r--r--pkg/sentry/fsimpl/proc/task.go337
-rw-r--r--pkg/sentry/fsimpl/proc/task_files.go272
-rw-r--r--pkg/sentry/fsimpl/proc/tasks.go218
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_files.go92
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_test.go555
-rw-r--r--pkg/sentry/fsimpl/proc/version.go6
-rw-r--r--pkg/sentry/fsimpl/tmpfs/BUILD (renamed from pkg/sentry/fsimpl/memfs/BUILD)38
-rw-r--r--pkg/sentry/fsimpl/tmpfs/benchmark_test.go (renamed from pkg/sentry/fsimpl/memfs/benchmark_test.go)43
-rw-r--r--pkg/sentry/fsimpl/tmpfs/directory.go (renamed from pkg/sentry/fsimpl/memfs/directory.go)2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/filesystem.go696
-rw-r--r--pkg/sentry/fsimpl/tmpfs/named_pipe.go (renamed from pkg/sentry/fsimpl/memfs/named_pipe.go)6
-rw-r--r--pkg/sentry/fsimpl/tmpfs/pipe_test.go (renamed from pkg/sentry/fsimpl/memfs/pipe_test.go)32
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file.go357
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file_test.go315
-rw-r--r--pkg/sentry/fsimpl/tmpfs/stat_test.go232
-rw-r--r--pkg/sentry/fsimpl/tmpfs/symlink.go (renamed from pkg/sentry/fsimpl/memfs/symlink.go)2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go (renamed from pkg/sentry/fsimpl/memfs/memfs.go)163
-rw-r--r--pkg/sentry/kernel/BUILD5
-rw-r--r--pkg/sentry/kernel/abstract_socket_namespace.go2
-rw-r--r--pkg/sentry/kernel/auth/BUILD3
-rw-r--r--pkg/sentry/kernel/auth/user_namespace.go2
-rw-r--r--pkg/sentry/kernel/epoll/BUILD1
-rw-r--r--pkg/sentry/kernel/epoll/epoll.go2
-rw-r--r--pkg/sentry/kernel/eventfd/BUILD1
-rw-r--r--pkg/sentry/kernel/eventfd/eventfd.go2
-rw-r--r--pkg/sentry/kernel/fasync/BUILD1
-rw-r--r--pkg/sentry/kernel/fasync/fasync.go3
-rw-r--r--pkg/sentry/kernel/fd_table.go2
-rw-r--r--pkg/sentry/kernel/fd_table_test.go2
-rw-r--r--pkg/sentry/kernel/fs_context.go2
-rw-r--r--pkg/sentry/kernel/futex/BUILD8
-rw-r--r--pkg/sentry/kernel/futex/futex.go3
-rw-r--r--pkg/sentry/kernel/futex/futex_test.go2
-rw-r--r--pkg/sentry/kernel/kernel.go29
-rw-r--r--pkg/sentry/kernel/memevent/BUILD1
-rw-r--r--pkg/sentry/kernel/memevent/memory_events.go2
-rw-r--r--pkg/sentry/kernel/pipe/BUILD1
-rw-r--r--pkg/sentry/kernel/pipe/buffer.go2
-rw-r--r--pkg/sentry/kernel/pipe/node.go3
-rw-r--r--pkg/sentry/kernel/pipe/pipe.go2
-rw-r--r--pkg/sentry/kernel/pipe/pipe_util.go2
-rw-r--r--pkg/sentry/kernel/pipe/vfs.go3
-rw-r--r--pkg/sentry/kernel/rseq.go383
-rw-r--r--pkg/sentry/kernel/semaphore/BUILD1
-rw-r--r--pkg/sentry/kernel/semaphore/semaphore.go8
-rw-r--r--pkg/sentry/kernel/shm/BUILD1
-rw-r--r--pkg/sentry/kernel/shm/shm.go87
-rw-r--r--pkg/sentry/kernel/signal_handlers.go3
-rw-r--r--pkg/sentry/kernel/signalfd/BUILD1
-rw-r--r--pkg/sentry/kernel/signalfd/signalfd.go3
-rw-r--r--pkg/sentry/kernel/syscalls.go10
-rw-r--r--pkg/sentry/kernel/syslog.go3
-rw-r--r--pkg/sentry/kernel/task.go68
-rw-r--r--pkg/sentry/kernel/task_block.go8
-rw-r--r--pkg/sentry/kernel/task_clone.go10
-rw-r--r--pkg/sentry/kernel/task_exec.go9
-rw-r--r--pkg/sentry/kernel/task_exit.go1
-rw-r--r--pkg/sentry/kernel/task_log.go86
-rw-r--r--pkg/sentry/kernel/task_run.go30
-rw-r--r--pkg/sentry/kernel/task_start.go18
-rw-r--r--pkg/sentry/kernel/task_syscall.go8
-rw-r--r--pkg/sentry/kernel/thread_group.go28
-rw-r--r--pkg/sentry/kernel/threads.go2
-rw-r--r--pkg/sentry/kernel/time/BUILD1
-rw-r--r--pkg/sentry/kernel/time/time.go2
-rw-r--r--pkg/sentry/kernel/timekeeper.go2
-rw-r--r--pkg/sentry/kernel/tty.go13
-rw-r--r--pkg/sentry/kernel/uts_namespace.go3
-rw-r--r--pkg/sentry/limits/BUILD1
-rw-r--r--pkg/sentry/limits/limits.go3
-rw-r--r--pkg/sentry/loader/elf.go2
-rw-r--r--pkg/sentry/memmap/BUILD1
-rw-r--r--pkg/sentry/memmap/memmap.go8
-rw-r--r--pkg/sentry/mm/BUILD2
-rw-r--r--pkg/sentry/mm/aio_context.go3
-rw-r--r--pkg/sentry/mm/mm.go8
-rw-r--r--pkg/sentry/mm/procfs.go12
-rw-r--r--pkg/sentry/pgalloc/BUILD1
-rw-r--r--pkg/sentry/pgalloc/pgalloc.go2
-rw-r--r--pkg/sentry/platform/interrupt/BUILD1
-rw-r--r--pkg/sentry/platform/interrupt/interrupt.go3
-rw-r--r--pkg/sentry/platform/kvm/BUILD13
-rw-r--r--pkg/sentry/platform/kvm/address_space.go2
-rw-r--r--pkg/sentry/platform/kvm/bluepill.go24
-rw-r--r--pkg/sentry/platform/kvm/bluepill_amd64.go20
-rw-r--r--pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go7
-rw-r--r--pkg/sentry/platform/kvm/bluepill_arm64.go79
-rw-r--r--pkg/sentry/platform/kvm/bluepill_arm64.s87
-rw-r--r--pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go26
-rw-r--r--pkg/sentry/platform/kvm/bluepill_unsafe.go9
-rw-r--r--pkg/sentry/platform/kvm/filters_amd64.go (renamed from pkg/sentry/platform/kvm/filters.go)0
-rw-r--r--pkg/sentry/platform/kvm/filters_arm64.go32
-rw-r--r--pkg/sentry/platform/kvm/kvm.go7
-rw-r--r--pkg/sentry/platform/kvm/kvm_amd64.go9
-rw-r--r--pkg/sentry/platform/kvm/kvm_arm64.go83
-rw-r--r--pkg/sentry/platform/kvm/kvm_arm64_unsafe.go39
-rw-r--r--pkg/sentry/platform/kvm/kvm_const.go4
-rw-r--r--pkg/sentry/platform/kvm/kvm_const_arm64.go132
-rw-r--r--pkg/sentry/platform/kvm/machine.go2
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64.go4
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64_unsafe.go40
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64.go124
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64_unsafe.go362
-rw-r--r--pkg/sentry/platform/kvm/machine_unsafe.go40
-rw-r--r--pkg/sentry/platform/kvm/physical_map.go12
-rw-r--r--pkg/sentry/platform/kvm/physical_map_amd64.go22
-rw-r--r--pkg/sentry/platform/kvm/physical_map_arm64.go (renamed from pkg/sentry/fsimpl/proc/proc.go)7
-rw-r--r--pkg/sentry/platform/kvm/testutil/testutil_arm64.s15
-rw-r--r--pkg/sentry/platform/ptrace/BUILD1
-rw-r--r--pkg/sentry/platform/ptrace/ptrace.go2
-rw-r--r--pkg/sentry/platform/ptrace/stub_amd64.s29
-rw-r--r--pkg/sentry/platform/ptrace/stub_arm64.s30
-rw-r--r--pkg/sentry/platform/ptrace/subprocess.go27
-rw-r--r--pkg/sentry/platform/ptrace/subprocess_amd64.go52
-rw-r--r--pkg/sentry/platform/ptrace/subprocess_arm64.go41
-rw-r--r--pkg/sentry/platform/ptrace/subprocess_linux.go38
-rw-r--r--pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go2
-rw-r--r--pkg/sentry/platform/ring0/BUILD1
-rw-r--r--pkg/sentry/platform/ring0/defs.go2
-rw-r--r--pkg/sentry/platform/ring0/defs_amd64.go1
-rw-r--r--pkg/sentry/platform/ring0/defs_arm64.go4
-rw-r--r--pkg/sentry/platform/ring0/entry_arm64.s97
-rw-r--r--pkg/sentry/platform/ring0/lib_arm64.go26
-rw-r--r--pkg/sentry/platform/ring0/lib_arm64.s121
-rw-r--r--pkg/sentry/platform/ring0/offsets_arm64.go1
-rw-r--r--pkg/sentry/platform/ring0/pagetables/BUILD5
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pcids_x86.go2
-rw-r--r--pkg/sentry/sighandling/sighandling.go75
-rw-r--r--pkg/sentry/sighandling/sighandling_unsafe.go26
-rw-r--r--pkg/sentry/socket/control/BUILD1
-rw-r--r--pkg/sentry/socket/control/control.go215
-rw-r--r--pkg/sentry/socket/hostinet/BUILD2
-rw-r--r--pkg/sentry/socket/hostinet/socket.go110
-rw-r--r--pkg/sentry/socket/netfilter/BUILD1
-rw-r--r--pkg/sentry/socket/netfilter/netfilter.go408
-rw-r--r--pkg/sentry/socket/netlink/BUILD2
-rw-r--r--pkg/sentry/socket/netlink/port/BUILD1
-rw-r--r--pkg/sentry/socket/netlink/port/port.go3
-rw-r--r--pkg/sentry/socket/netlink/socket.go31
-rw-r--r--pkg/sentry/socket/netstack/BUILD1
-rw-r--r--pkg/sentry/socket/netstack/netstack.go198
-rw-r--r--pkg/sentry/socket/rpcinet/conn/BUILD1
-rw-r--r--pkg/sentry/socket/rpcinet/conn/conn.go2
-rw-r--r--pkg/sentry/socket/rpcinet/notifier/BUILD1
-rw-r--r--pkg/sentry/socket/rpcinet/notifier/notifier.go2
-rw-r--r--pkg/sentry/socket/rpcinet/syscall_rpc.proto1
-rw-r--r--pkg/sentry/socket/socket.go5
-rw-r--r--pkg/sentry/socket/unix/io.go13
-rw-r--r--pkg/sentry/socket/unix/transport/BUILD1
-rw-r--r--pkg/sentry/socket/unix/transport/connectioned.go3
-rw-r--r--pkg/sentry/socket/unix/transport/queue.go3
-rw-r--r--pkg/sentry/socket/unix/transport/unix.go26
-rw-r--r--pkg/sentry/socket/unix/unix.go31
-rw-r--r--pkg/sentry/strace/BUILD4
-rw-r--r--pkg/sentry/strace/linux64_amd64.go (renamed from pkg/sentry/strace/linux64.go)51
-rw-r--r--pkg/sentry/strace/linux64_arm64.go323
-rw-r--r--pkg/sentry/strace/select.go56
-rw-r--r--pkg/sentry/strace/socket.go11
-rw-r--r--pkg/sentry/strace/strace.go2
-rw-r--r--pkg/sentry/strace/syscalls.go13
-rw-r--r--pkg/sentry/syscalls/linux/BUILD5
-rw-r--r--pkg/sentry/syscalls/linux/error.go2
-rw-r--r--pkg/sentry/syscalls/linux/linux64_amd64.go28
-rw-r--r--pkg/sentry/syscalls/linux/linux64_arm64.go27
-rw-r--r--pkg/sentry/syscalls/linux/sys_clone_amd64.go35
-rw-r--r--pkg/sentry/syscalls/linux/sys_clone_arm64.go35
-rw-r--r--pkg/sentry/syscalls/linux/sys_file.go82
-rw-r--r--pkg/sentry/syscalls/linux/sys_futex.go10
-rw-r--r--pkg/sentry/syscalls/linux/sys_poll.go71
-rw-r--r--pkg/sentry/syscalls/linux/sys_rseq.go48
-rw-r--r--pkg/sentry/syscalls/linux/sys_shm.go7
-rw-r--r--pkg/sentry/syscalls/linux/sys_socket.go20
-rw-r--r--pkg/sentry/syscalls/linux/sys_thread.go13
-rw-r--r--pkg/sentry/syscalls/linux/sys_xattr.go169
-rw-r--r--pkg/sentry/time/BUILD4
-rw-r--r--pkg/sentry/time/calibrated_clock.go2
-rw-r--r--pkg/sentry/usage/BUILD1
-rw-r--r--pkg/sentry/usage/memory.go2
-rw-r--r--pkg/sentry/vfs/BUILD6
-rw-r--r--pkg/sentry/vfs/context.go13
-rw-r--r--pkg/sentry/vfs/dentry.go49
-rw-r--r--pkg/sentry/vfs/device.go100
-rw-r--r--pkg/sentry/vfs/file_description.go409
-rw-r--r--pkg/sentry/vfs/file_description_impl_util.go62
-rw-r--r--pkg/sentry/vfs/file_description_impl_util_test.go14
-rw-r--r--pkg/sentry/vfs/filesystem.go362
-rw-r--r--pkg/sentry/vfs/filesystem_impl_util.go26
-rw-r--r--pkg/sentry/vfs/filesystem_type.go55
-rw-r--r--pkg/sentry/vfs/mount.go82
-rw-r--r--pkg/sentry/vfs/mount_test.go3
-rw-r--r--pkg/sentry/vfs/mount_unsafe.go4
-rw-r--r--pkg/sentry/vfs/options.go33
-rw-r--r--pkg/sentry/vfs/pathname.go152
-rw-r--r--pkg/sentry/vfs/permissions.go62
-rw-r--r--pkg/sentry/vfs/resolving_path.go88
-rw-r--r--pkg/sentry/vfs/syscalls.go237
-rw-r--r--pkg/sentry/vfs/testutil.go36
-rw-r--r--pkg/sentry/vfs/vfs.go594
-rw-r--r--pkg/sentry/watchdog/BUILD1
-rw-r--r--pkg/sentry/watchdog/watchdog.go6
-rw-r--r--pkg/sleep/sleep_test.go31
-rw-r--r--pkg/sync/BUILD (renamed from pkg/syncutil/BUILD)11
-rw-r--r--pkg/sync/LICENSE (renamed from pkg/syncutil/LICENSE)0
-rw-r--r--pkg/sync/README.md (renamed from pkg/syncutil/README.md)0
-rw-r--r--pkg/sync/aliases.go37
-rw-r--r--pkg/sync/atomicptr_unsafe.go (renamed from pkg/syncutil/atomicptr_unsafe.go)0
-rw-r--r--pkg/sync/atomicptrtest/BUILD (renamed from pkg/syncutil/atomicptrtest/BUILD)4
-rw-r--r--pkg/sync/atomicptrtest/atomicptr_test.go (renamed from pkg/syncutil/atomicptrtest/atomicptr_test.go)0
-rw-r--r--pkg/sync/downgradable_rwmutex_test.go (renamed from pkg/syncutil/downgradable_rwmutex_test.go)2
-rw-r--r--pkg/sync/downgradable_rwmutex_unsafe.go (renamed from pkg/syncutil/downgradable_rwmutex_unsafe.go)7
-rw-r--r--pkg/sync/memmove_unsafe.go (renamed from pkg/syncutil/memmove_unsafe.go)2
-rw-r--r--pkg/sync/norace_unsafe.go (renamed from pkg/syncutil/norace_unsafe.go)2
-rw-r--r--pkg/sync/race_unsafe.go (renamed from pkg/syncutil/race_unsafe.go)2
-rw-r--r--pkg/sync/seqatomic_unsafe.go (renamed from pkg/syncutil/seqatomic_unsafe.go)16
-rw-r--r--pkg/sync/seqatomictest/BUILD (renamed from pkg/syncutil/seqatomictest/BUILD)10
-rw-r--r--pkg/sync/seqatomictest/seqatomic_test.go (renamed from pkg/syncutil/seqatomictest/seqatomic_test.go)18
-rw-r--r--pkg/sync/seqcount.go (renamed from pkg/syncutil/seqcount.go)2
-rw-r--r--pkg/sync/seqcount_test.go (renamed from pkg/syncutil/seqcount_test.go)2
-rw-r--r--pkg/sync/syncutil.go (renamed from pkg/syncutil/syncutil.go)4
-rw-r--r--pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go21
-rw-r--r--pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go16
-rw-r--r--pkg/syserror/syserror.go1
-rw-r--r--pkg/tcpip/BUILD9
-rw-r--r--pkg/tcpip/adapters/gonet/BUILD1
-rw-r--r--pkg/tcpip/adapters/gonet/gonet.go2
-rw-r--r--pkg/tcpip/checker/checker.go22
-rw-r--r--pkg/tcpip/header/BUILD10
-rw-r--r--pkg/tcpip/header/ipv6.go149
-rw-r--r--pkg/tcpip/header/ipv6_test.go302
-rw-r--r--pkg/tcpip/header/ndp_options.go123
-rw-r--r--pkg/tcpip/header/ndp_router_solicit.go36
-rw-r--r--pkg/tcpip/header/ndp_test.go215
-rw-r--r--pkg/tcpip/iptables/BUILD5
-rw-r--r--pkg/tcpip/iptables/iptables.go120
-rw-r--r--pkg/tcpip/iptables/targets.go16
-rw-r--r--pkg/tcpip/iptables/types.go49
-rw-r--r--pkg/tcpip/link/fdbased/BUILD1
-rw-r--r--pkg/tcpip/link/fdbased/endpoint.go2
-rw-r--r--pkg/tcpip/link/sharedmem/BUILD2
-rw-r--r--pkg/tcpip/link/sharedmem/pipe/BUILD1
-rw-r--r--pkg/tcpip/link/sharedmem/pipe/pipe_test.go3
-rw-r--r--pkg/tcpip/link/sharedmem/sharedmem.go2
-rw-r--r--pkg/tcpip/link/sharedmem/sharedmem_test.go2
-rw-r--r--pkg/tcpip/network/arp/arp.go6
-rw-r--r--pkg/tcpip/network/fragmentation/BUILD1
-rw-r--r--pkg/tcpip/network/fragmentation/fragmentation.go2
-rw-r--r--pkg/tcpip/network/fragmentation/reassembler.go2
-rw-r--r--pkg/tcpip/network/ip_test.go4
-rw-r--r--pkg/tcpip/network/ipv4/ipv4.go18
-rw-r--r--pkg/tcpip/network/ipv6/ipv6.go16
-rw-r--r--pkg/tcpip/ports/BUILD3
-rw-r--r--pkg/tcpip/ports/ports.go150
-rw-r--r--pkg/tcpip/ports/ports_test.go182
-rw-r--r--pkg/tcpip/sample/tun_tcp_connect/BUILD1
-rw-r--r--pkg/tcpip/sample/tun_tcp_echo/BUILD1
-rw-r--r--pkg/tcpip/stack/BUILD5
-rw-r--r--pkg/tcpip/stack/linkaddrcache.go2
-rw-r--r--pkg/tcpip/stack/linkaddrcache_test.go2
-rw-r--r--pkg/tcpip/stack/ndp.go950
-rw-r--r--pkg/tcpip/stack/ndp_test.go2686
-rw-r--r--pkg/tcpip/stack/nic.go392
-rw-r--r--pkg/tcpip/stack/registration.go6
-rw-r--r--pkg/tcpip/stack/route.go6
-rw-r--r--pkg/tcpip/stack/stack.go237
-rw-r--r--pkg/tcpip/stack/stack_test.go401
-rw-r--r--pkg/tcpip/stack/transport_demuxer.go56
-rw-r--r--pkg/tcpip/stack/transport_demuxer_test.go92
-rw-r--r--pkg/tcpip/stack/transport_test.go14
-rw-r--r--pkg/tcpip/tcpip.go66
-rw-r--r--pkg/tcpip/timer.go161
-rw-r--r--pkg/tcpip/timer_test.go236
-rw-r--r--pkg/tcpip/transport/icmp/BUILD1
-rw-r--r--pkg/tcpip/transport/icmp/endpoint.go39
-rw-r--r--pkg/tcpip/transport/packet/BUILD1
-rw-r--r--pkg/tcpip/transport/packet/endpoint.go25
-rw-r--r--pkg/tcpip/transport/raw/BUILD1
-rw-r--r--pkg/tcpip/transport/raw/endpoint.go47
-rw-r--r--pkg/tcpip/transport/tcp/BUILD18
-rw-r--r--pkg/tcpip/transport/tcp/accept.go30
-rw-r--r--pkg/tcpip/transport/tcp/connect.go412
-rw-r--r--pkg/tcpip/transport/tcp/dispatcher.go224
-rw-r--r--pkg/tcpip/transport/tcp/dual_stack_test.go8
-rw-r--r--pkg/tcpip/transport/tcp/endpoint.go556
-rw-r--r--pkg/tcpip/transport/tcp/endpoint_state.go32
-rw-r--r--pkg/tcpip/transport/tcp/forwarder.go3
-rw-r--r--pkg/tcpip/transport/tcp/protocol.go34
-rw-r--r--pkg/tcpip/transport/tcp/rcv.go48
-rw-r--r--pkg/tcpip/transport/tcp/rcv_state.go29
-rw-r--r--pkg/tcpip/transport/tcp/segment_queue.go2
-rw-r--r--pkg/tcpip/transport/tcp/snd.go72
-rw-r--r--pkg/tcpip/transport/tcp/snd_state.go10
-rw-r--r--pkg/tcpip/transport/tcp/tcp_test.go815
-rw-r--r--pkg/tcpip/transport/tcp/testing/context/context.go25
-rw-r--r--pkg/tcpip/transport/udp/BUILD2
-rw-r--r--pkg/tcpip/transport/udp/endpoint.go193
-rw-r--r--pkg/tcpip/transport/udp/udp_test.go105
-rw-r--r--pkg/tmutex/BUILD1
-rw-r--r--pkg/tmutex/tmutex_test.go3
-rw-r--r--pkg/unet/BUILD1
-rw-r--r--pkg/unet/unet_test.go3
-rw-r--r--pkg/urpc/BUILD1
-rw-r--r--pkg/urpc/urpc.go2
-rw-r--r--pkg/waiter/BUILD1
-rw-r--r--pkg/waiter/waiter.go2
-rw-r--r--runsc/boot/BUILD4
-rw-r--r--runsc/boot/compat.go63
-rw-r--r--runsc/boot/compat_amd64.go85
-rw-r--r--runsc/boot/compat_arm64.go91
-rw-r--r--runsc/boot/compat_test.go45
-rw-r--r--runsc/boot/config.go9
-rw-r--r--runsc/boot/controller.go4
-rw-r--r--runsc/boot/filter/config.go53
-rw-r--r--runsc/boot/fs.go23
-rw-r--r--runsc/boot/fs_test.go97
-rw-r--r--runsc/boot/limits.go2
-rw-r--r--runsc/boot/loader.go52
-rw-r--r--runsc/boot/loader_test.go2
-rw-r--r--runsc/boot/network.go44
-rw-r--r--runsc/cgroup/cgroup.go24
-rw-r--r--runsc/cmd/BUILD1
-rw-r--r--runsc/cmd/create.go1
-rw-r--r--runsc/cmd/debug.go42
-rw-r--r--runsc/cmd/gofer.go2
-rw-r--r--runsc/cmd/start.go1
-rw-r--r--runsc/container/BUILD2
-rw-r--r--runsc/container/console_test.go13
-rw-r--r--runsc/container/container_test.go215
-rw-r--r--runsc/container/multi_container_test.go55
-rw-r--r--runsc/container/state_file.go2
-rw-r--r--runsc/container/test_app/BUILD1
-rw-r--r--runsc/container/test_app/test_app.go40
-rw-r--r--runsc/dockerutil/dockerutil.go10
-rw-r--r--runsc/fsgofer/BUILD1
-rw-r--r--runsc/fsgofer/fsgofer.go7
-rw-r--r--runsc/main.go2
-rw-r--r--runsc/sandbox/BUILD2
-rw-r--r--runsc/sandbox/network.go116
-rw-r--r--runsc/sandbox/sandbox.go39
-rw-r--r--runsc/testutil/BUILD1
-rw-r--r--runsc/testutil/testutil.go2
-rwxr-xr-xscripts/benchmarks.sh53
-rwxr-xr-xscripts/common.sh2
-rwxr-xr-xscripts/dev.sh3
-rwxr-xr-xscripts/go.sh2
-rwxr-xr-xscripts/iptables_tests.sh27
-rwxr-xr-xscripts/issue_reviver.sh27
-rwxr-xr-xscripts/simple_tests.sh2
-rw-r--r--test/iptables/BUILD31
-rw-r--r--test/iptables/README.md44
-rw-r--r--test/iptables/filter_input.go124
-rw-r--r--test/iptables/iptables.go53
-rw-r--r--test/iptables/iptables_test.go179
-rw-r--r--test/iptables/iptables_util.go82
-rw-r--r--test/iptables/runner/BUILD16
-rw-r--r--test/iptables/runner/Dockerfile4
-rw-r--r--test/iptables/runner/main.go70
-rw-r--r--test/syscalls/BUILD8
-rw-r--r--test/syscalls/build_defs.bzl17
-rw-r--r--test/syscalls/linux/BUILD80
-rw-r--r--test/syscalls/linux/aio.cc2
-rw-r--r--test/syscalls/linux/clock_gettime.cc5
-rw-r--r--test/syscalls/linux/exceptions.cc181
-rw-r--r--test/syscalls/linux/exec.cc145
-rw-r--r--test/syscalls/linux/fcntl.cc162
-rw-r--r--test/syscalls/linux/file_base.h89
-rw-r--r--test/syscalls/linux/futex.cc21
-rw-r--r--test/syscalls/linux/inotify.cc32
-rw-r--r--test/syscalls/linux/ioctl.cc3
-rw-r--r--test/syscalls/linux/ip_socket_test_util.cc10
-rw-r--r--test/syscalls/linux/ip_socket_test_util.h25
-rw-r--r--test/syscalls/linux/mmap.cc207
-rw-r--r--test/syscalls/linux/partial_bad_buffer.cc138
-rw-r--r--test/syscalls/linux/pipe.cc14
-rw-r--r--test/syscalls/linux/poll.cc3
-rw-r--r--test/syscalls/linux/preadv2.cc2
-rw-r--r--test/syscalls/linux/proc.cc40
-rw-r--r--test/syscalls/linux/proc_net_tcp.cc1
-rw-r--r--test/syscalls/linux/proc_net_udp.cc1
-rw-r--r--test/syscalls/linux/readv_common.cc45
-rw-r--r--test/syscalls/linux/readv_socket.cc45
-rw-r--r--test/syscalls/linux/rseq.cc198
-rw-r--r--test/syscalls/linux/rseq/BUILD59
-rw-r--r--test/syscalls/linux/rseq/critical.S66
-rw-r--r--test/syscalls/linux/rseq/critical.h39
-rw-r--r--test/syscalls/linux/rseq/rseq.cc366
-rw-r--r--test/syscalls/linux/rseq/start.S45
-rw-r--r--test/syscalls/linux/rseq/syscalls.h66
-rw-r--r--test/syscalls/linux/rseq/test.h43
-rw-r--r--test/syscalls/linux/rseq/types.h31
-rw-r--r--test/syscalls/linux/rseq/uapi.h54
-rw-r--r--test/syscalls/linux/sigaltstack.cc8
-rw-r--r--test/syscalls/linux/socket_bind_to_device_distribution.cc25
-rw-r--r--test/syscalls/linux/socket_bind_to_device_sequence.cc382
-rw-r--r--test/syscalls/linux/socket_inet_loopback.cc339
-rw-r--r--test/syscalls/linux/socket_ip_loopback_blocking.cc2
-rw-r--r--test/syscalls/linux/socket_ip_tcp_generic.cc86
-rw-r--r--test/syscalls/linux/socket_ip_tcp_loopback.cc2
-rw-r--r--test/syscalls/linux/socket_ip_udp_generic.cc40
-rw-r--r--test/syscalls/linux/socket_ip_unbound.cc33
-rw-r--r--test/syscalls/linux/socket_ipv4_udp_unbound.cc4
-rw-r--r--test/syscalls/linux/socket_non_stream.cc113
-rw-r--r--test/syscalls/linux/socket_non_stream_blocking.cc37
-rw-r--r--test/syscalls/linux/socket_stream.cc55
-rw-r--r--test/syscalls/linux/socket_unix.cc15
-rw-r--r--test/syscalls/linux/socket_unix_cmsg.cc29
-rw-r--r--test/syscalls/linux/tcp_socket.cc111
-rw-r--r--test/syscalls/linux/udp_socket.cc1321
-rw-r--r--test/syscalls/linux/udp_socket_errqueue_test_case.cc54
-rw-r--r--test/syscalls/linux/udp_socket_test_cases.cc1500
-rw-r--r--test/syscalls/linux/udp_socket_test_cases.h74
-rw-r--r--test/syscalls/linux/xattr.cc488
-rw-r--r--test/syscalls/syscall_test_runner.go10
-rw-r--r--test/util/BUILD2
-rw-r--r--test/util/fs_util.cc11
-rw-r--r--test/util/test_util.cc6
-rw-r--r--test/util/test_util.h7
-rw-r--r--test/util/test_util_runfiles.cc46
-rw-r--r--tools/go_stateify/defs.bzl17
-rw-r--r--tools/go_stateify/main.go112
-rw-r--r--tools/issue_reviver/BUILD12
-rw-r--r--tools/issue_reviver/github/BUILD17
-rw-r--r--tools/issue_reviver/github/github.go164
-rw-r--r--tools/issue_reviver/main.go89
-rw-r--r--tools/issue_reviver/reviver/BUILD19
-rw-r--r--tools/issue_reviver/reviver/reviver.go192
-rw-r--r--tools/issue_reviver/reviver/reviver_test.go88
-rw-r--r--vdso/BUILD8
730 files changed, 37214 insertions, 6856 deletions
diff --git a/.bazelrc b/.bazelrc
index 379fc8328..7f87e94b1 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -12,6 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# Build with C++17.
+build --cxxopt=-std=c++17
+
# Display the current git revision in the info block.
build --stamp --workspace_status_command tools/workspace_status.sh
diff --git a/BUILD b/BUILD
index de410b008..76286174f 100644
--- a/BUILD
+++ b/BUILD
@@ -23,7 +23,18 @@ go_path(
"//runsc",
# Packages that are not dependencies of //runsc.
+ "//pkg/sentry/kernel/memevent",
+ "//pkg/tcpip/adapters/gonet",
"//pkg/tcpip/link/channel",
+ "//pkg/tcpip/link/muxed",
+ "//pkg/tcpip/link/sharedmem",
+ "//pkg/tcpip/link/sharedmem/pipe",
+ "//pkg/tcpip/link/sharedmem/queue",
+ "//pkg/tcpip/link/tun",
+ "//pkg/tcpip/link/waitable",
+ "//pkg/tcpip/sample/tun_tcp_connect",
+ "//pkg/tcpip/sample/tun_tcp_echo",
+ "//pkg/tcpip/transport/tcpconntrack",
],
)
diff --git a/Dockerfile b/Dockerfile
index 6e9d870db..738623023 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,8 @@
FROM ubuntu:bionic
-RUN apt-get update && apt-get install -y curl gnupg2 git python3
+RUN apt-get update && apt-get install -y curl gnupg2 git python python3 python3-distutils python3-pip
RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
- curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
+ curl https://bazel.build/bazel-release.pub.gpg | apt-key add -
RUN apt-get update && apt-get install -y bazel && apt-get clean
WORKDIR /gvisor
diff --git a/Makefile b/Makefile
index 1735c07df..a73bc0c36 100644
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,7 @@ bazel-server-start: docker-build
--privileged \
gvisor-bazel \
sh -c "while :; do sleep 100; done" && \
- docker exec --user 0:0 -i gvisor-bazel sh -c "groupadd --gid $(GID) --non-unique gvisor && useradd --uid $(UID) --gid $(GID) -d $(HOME) gvisor"
+ docker exec --user 0:0 -i gvisor-bazel sh -c "groupadd --gid $(GID) --non-unique gvisor && useradd --uid $(UID) --non-unique --gid $(GID) -d $(HOME) gvisor"
bazel-server:
docker exec gvisor-bazel true || \
diff --git a/README.md b/README.md
index 5ac6f9046..de3e06f4e 100644
--- a/README.md
+++ b/README.md
@@ -48,9 +48,10 @@ Make sure the following dependencies are installed:
* Linux 4.14.77+ ([older linux][old-linux])
* [git][git]
-* [Bazel][bazel] 0.28.0+
+* [Bazel][bazel] 1.2+
* [Python][python]
* [Docker version 17.09.0 or greater][docker]
+* C++ toolchain supporting C++17 (GCC 7+, Clang 5+)
* Gold linker (e.g. `binutils-gold` package on Ubuntu)
### Building
diff --git a/WORKSPACE b/WORKSPACE
index 0ad2bb17c..e2afc073c 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,6 +1,7 @@
-# Load go bazel rules and gazelle.
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
+# Load go bazel rules and gazelle.
http_archive(
name = "io_bazel_rules_go",
sha256 = "b9aa86ec08a292b97ec4591cf578e020b35f98e12173bbd4a921f84f583aebd9",
@@ -58,6 +59,26 @@ load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps")
protobuf_deps()
+# Load python dependencies.
+git_repository(
+ name = "rules_python",
+ commit = "94677401bc56ed5d756f50b441a6a5c7f735a6d4",
+ remote = "https://github.com/bazelbuild/rules_python.git",
+ shallow_since = "1573842889 -0500",
+)
+
+load("@rules_python//python:pip.bzl", "pip_import")
+
+pip_import(
+ name = "pydeps",
+ python_interpreter = "python3",
+ requirements = "//benchmarks:requirements.txt",
+)
+
+load("@pydeps//:requirements.bzl", "pip_install")
+
+pip_install()
+
# Load bazel_toolchain to support Remote Build Execution.
# See releases at https://releases.bazel.build/bazel-toolchains.html
http_archive(
@@ -85,6 +106,45 @@ load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
rules_pkg_dependencies()
+# Container rules.
+http_archive(
+ name = "io_bazel_rules_docker",
+ sha256 = "14ac30773fdb393ddec90e158c9ec7ebb3f8a4fd533ec2abbfd8789ad81a284b",
+ strip_prefix = "rules_docker-0.12.1",
+ urls = ["https://github.com/bazelbuild/rules_docker/releases/download/v0.12.1/rules_docker-v0.12.1.tar.gz"],
+)
+
+load(
+ "@io_bazel_rules_docker//repositories:repositories.bzl",
+ container_repositories = "repositories",
+)
+
+container_repositories()
+
+load("@io_bazel_rules_docker//repositories:deps.bzl", container_deps = "deps")
+
+container_deps()
+
+load(
+ "@io_bazel_rules_docker//container:container.bzl",
+ "container_pull",
+)
+
+# This container is built from the Dockerfile in test/iptables/runner.
+container_pull(
+ name = "iptables-test",
+ registry = "gcr.io",
+ repository = "gvisor-presubmit/iptables-test",
+ digest = "sha256:a137d692a2eb9fc7bf95c5f4a568da090e2c31098e93634421ed88f3a3f1db65",
+)
+
+load(
+ "@io_bazel_rules_docker//go:image.bzl",
+ _go_image_repos = "repositories",
+)
+
+_go_image_repos()
+
# External repositories, in sorted order.
go_repository(
name = "com_github_cenkalti_backoff",
@@ -230,6 +290,27 @@ go_repository(
version = "v1.3.1",
)
+go_repository(
+ name = "com_github_google_go-github",
+ importpath = "github.com/google/go-github",
+ sum = "h1:N0LgJ1j65A7kfXrZnUDaYCs/Sf4rEjNlfyDHW9dolSY=",
+ version = "v17.0.0",
+)
+
+go_repository(
+ name = "org_golang_x_oauth2",
+ importpath = "golang.org/x/oauth2",
+ sum = "h1:pE8b58s1HRDMi8RDc79m0HISf9D4TzseP40cEA6IGfs=",
+ version = "v0.0.0-20191202225959-858c2ad4c8b6",
+)
+
+go_repository(
+ name = "com_github_google_go-querystring",
+ importpath = "github.com/google/go-querystring",
+ sum = "h1:Xkwi/a1rcvNg1PPYe5vI8GbeBY/jrVuDX5ASuANWTrk=",
+ version = "v1.0.0",
+)
+
# System Call test dependencies.
http_archive(
name = "com_google_absl",
diff --git a/benchmarks/BUILD b/benchmarks/BUILD
new file mode 100644
index 000000000..1455c6c5b
--- /dev/null
+++ b/benchmarks/BUILD
@@ -0,0 +1,10 @@
+package(licenses = ["notice"])
+
+py_binary(
+ name = "benchmarks",
+ srcs = ["run.py"],
+ main = "run.py",
+ python_version = "PY3",
+ srcs_version = "PY3",
+ deps = ["//benchmarks/runner"],
+)
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000..ff21614c5
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,162 @@
+# Benchmark tools
+
+These scripts are tools for collecting performance data for Docker-based tests.
+
+## Setup
+
+The scripts assume the following:
+
+* There are two sets of machines: one where the scripts will be run
+ (controller) and one or more machines on which docker containers will be run
+ (environment).
+* The controller machine must have bazel installed along with this source
+ code. You should be able to run a command like `bazel run :benchmarks --
+ --list`
+* Environment machines must have docker and the required runtimes installed.
+ More specifically, you should be able to run a command like: `docker run
+ --runtime=$RUNTIME your/image`.
+* The controller has an ssh private key which can be used to log in to environment
+ machines and run docker commands without using `sudo`. This is not required
+ if running locally via the `run-local` command.
+* The docker daemon on each of your environment machines is listening on
+ `unix:///var/run/docker.sock` (docker's default).
+
+For configuring the environment manually, consult the
+[dockerd documentation][dockerd].
+
+## Running benchmarks
+
+Run the following from the benchmarks directory:
+
+```bash
+bazel run :benchmarks -- run-local startup
+
+...
+method,metric,result
+startup.empty,startup_time_ms,652.5772
+startup.node,startup_time_ms,1654.4042000000002
+startup.ruby,startup_time_ms,1429.835
+```
+
+The above command ran the startup benchmark locally, which consists of three
+benchmarks (empty, node, and ruby). The benchmark tool ran them on the default
+runtime, runc. Running on another installed runtime, such as runsc, is as
+simple as:
+
+```bash
+bazel run :benchmarks -- run-local startup --runtime=runsc
+```
+
+There is help: run `bazel run :benchmarks -- --help` or
+`bazel run :benchmarks -- run-local --help`.
+
+To list available benchmarks, use the `list` command:
+
+```bash
+bazel run :benchmarks -- list
+ls
+
+...
+Benchmark: sysbench.cpu
+Metrics: events_per_second
+ Run sysbench CPU test. Additional arguments can be provided for sysbench.
+
+ :param max_prime: The maximum prime number to search.
+```
+
+You can choose benchmarks by name or regex like:
+
+```bash
+bazel run :benchmarks -- run-local startup.node
+...
+metric,result
+startup_time_ms,1671.7178000000001
+
+```
+
+or
+
+```bash
+bazel run :benchmarks -- run-local s
+...
+method,metric,result
+startup.empty,startup_time_ms,1792.8292
+startup.node,startup_time_ms,3113.5274
+startup.ruby,startup_time_ms,3025.2424
+sysbench.cpu,cpu_events_per_second,12661.47
+sysbench.memory,memory_ops_per_second,7228268.44
+sysbench.mutex,mutex_time,17.4835
+sysbench.mutex,mutex_latency,3496.7
+sysbench.mutex,mutex_deviation,0.04
+syscall.syscall,syscall_time_ns,2065.0
+```
+
+You can run parameterized benchmarks, for example to run with different
+runtimes:
+
+```bash
+bazel run :benchmarks -- run-local --runtime=runc --runtime=runsc sysbench.cpu
+```
+
+Or with different parameters:
+
+```bash
+bazel run :benchmarks -- run-local --max_prime=10 --max_prime=100 sysbench.cpu
+```
+
+## Writing benchmarks
+
+To write new benchmarks, you should familiarize yourself with the structure of
+the repository. There are three key components.
+
+## Harness
+
+The harness makes use of the [docker-py SDK][docker-py]. It is advisable that
+you familiarize yourself with that API when making changes, specifically:
+
+* clients
+* containers
+* images
+
+In general, benchmarks need only interact with the `Machine` objects provided to
+the benchmark function, which are the machines defined in the environment. These
+objects allow the benchmark to define the relationships between different
+containers, and parse the output.
+
+## Workloads
+
+The harness requires workloads to run. These are all available in the
+`workloads` directory.
+
+In general, a workload consists of a Dockerfile to build it (while these are not
+hermetic, in general they should be as fixed and isolated as possible), some
+parsers for output if required, parser tests and sample data. Provided the test
+is named after the workload package and contains a function named `sample`, that
+function will be used to automatically mock workload output when the `--mock`
+flag is provided to the main tool.
+
+## Benchmarks
+
+Benchmarks define the tests themselves. All benchmarks have the following
+function signature:
+
+```python
+def my_func(output) -> float:
+ return float(output)
+
+@benchmark(metrics = my_func, machines = 1)
+def my_benchmark(machine: machine.Machine, arg: str):
+ return "3.4432"
+```
+
+Each benchmark takes a variable number of positional arguments as
+`harness.Machine` objects and some set of keyword arguments. It is recommended
+that you accept arbitrary keyword arguments and pass them through when
+constructing the container under test.
+
+To write a new benchmark, open a module in the `suites` directory and use the
+above signature. You should add a descriptive docstring to describe what your
+benchmark does and any test-centric arguments.
+
+[dockerd]: https://docs.docker.com/engine/reference/commandline/dockerd/
+[docker-py]: https://docker-py.readthedocs.io/en/stable/
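The Workloads section of the README above describes a `sample` convention for mocking output. A minimal sketch of what such a workload module could look like is below; the workload name (`myworkload`) and the metric are hypothetical and not part of this change.

```python
# Hypothetical benchmarks/workloads/myworkload/__init__.py.
import re


def sample(**env) -> str:
  """Returns canned output, used when the runner is invoked with --mock."""
  del env  # Environment variables do not matter for the mock.
  return "requests_per_second: 1234.5"


def requests_per_second(output: str) -> float:
  """Parses the metric out of real or mocked workload output."""
  match = re.search(r"requests_per_second:\s*([\d.]+)", output)
  return float(match.group(1)) if match else 0.0
```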
diff --git a/benchmarks/defs.bzl b/benchmarks/defs.bzl
new file mode 100644
index 000000000..79e6cdbc8
--- /dev/null
+++ b/benchmarks/defs.bzl
@@ -0,0 +1,18 @@
+"""Provides python helper functions."""
+
+load("@pydeps//:requirements.bzl", _requirement = "requirement")
+
+def filter_deps(deps = None):
+ if deps == None:
+ deps = []
+ return [dep for dep in deps if dep]
+
+def py_library(deps = None, **kwargs):
+ return native.py_library(deps = filter_deps(deps), **kwargs)
+
+def py_test(deps = None, **kwargs):
+ return native.py_test(deps = filter_deps(deps), **kwargs)
+
+def requirement(name, direct = True):
+ """ requirement returns the required dependency. """
+ return _requirement(name)
diff --git a/benchmarks/examples/localhost.yaml b/benchmarks/examples/localhost.yaml
new file mode 100644
index 000000000..f70fe0fb7
--- /dev/null
+++ b/benchmarks/examples/localhost.yaml
@@ -0,0 +1,2 @@
+client: localhost
+server: localhost
diff --git a/benchmarks/harness/BUILD b/benchmarks/harness/BUILD
new file mode 100644
index 000000000..081a74243
--- /dev/null
+++ b/benchmarks/harness/BUILD
@@ -0,0 +1,92 @@
+load("//benchmarks:defs.bzl", "py_library", "requirement")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "harness",
+ srcs = ["__init__.py"],
+)
+
+py_library(
+ name = "benchmark_driver",
+ srcs = ["benchmark_driver.py"],
+ deps = [
+ "//benchmarks/harness/machine_mocks",
+ "//benchmarks/harness/machine_producers:machine_producer",
+ "//benchmarks/suites",
+ ],
+)
+
+py_library(
+ name = "container",
+ srcs = ["container.py"],
+ deps = [
+ "//benchmarks/workloads",
+ requirement("asn1crypto", False),
+ requirement("chardet", False),
+ requirement("certifi", False),
+ requirement("docker", True),
+ requirement("docker-pycreds", False),
+ requirement("idna", False),
+ requirement("ptyprocess", False),
+ requirement("requests", False),
+ requirement("urllib3", False),
+ requirement("websocket-client", False),
+ ],
+)
+
+py_library(
+ name = "machine",
+ srcs = ["machine.py"],
+ deps = [
+ "//benchmarks/harness",
+ "//benchmarks/harness:container",
+ "//benchmarks/harness:ssh_connection",
+ "//benchmarks/harness:tunnel_dispatcher",
+ "//benchmarks/harness/machine_mocks",
+ requirement("asn1crypto", False),
+ requirement("chardet", False),
+ requirement("certifi", False),
+ requirement("docker", True),
+ requirement("docker-pycreds", False),
+ requirement("idna", False),
+ requirement("ptyprocess", False),
+ requirement("requests", False),
+ requirement("six", False),
+ requirement("urllib3", False),
+ requirement("websocket-client", False),
+ ],
+)
+
+py_library(
+ name = "ssh_connection",
+ srcs = ["ssh_connection.py"],
+ deps = [
+ "//benchmarks/harness",
+ requirement("bcrypt", False),
+ requirement("cffi", True),
+ requirement("paramiko", True),
+ requirement("cryptography", False),
+ ],
+)
+
+py_library(
+ name = "tunnel_dispatcher",
+ srcs = ["tunnel_dispatcher.py"],
+ deps = [
+ requirement("asn1crypto", False),
+ requirement("chardet", False),
+ requirement("certifi", False),
+ requirement("docker", True),
+ requirement("docker-pycreds", False),
+ requirement("idna", False),
+ requirement("pexpect", True),
+ requirement("ptyprocess", False),
+ requirement("requests", False),
+ requirement("urllib3", False),
+ requirement("websocket-client", False),
+ ],
+)
diff --git a/benchmarks/harness/__init__.py b/benchmarks/harness/__init__.py
new file mode 100644
index 000000000..7b96d1666
--- /dev/null
+++ b/benchmarks/harness/__init__.py
@@ -0,0 +1,25 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Core benchmark utilities."""
+
+import os
+
+# LOCAL_WORKLOADS_PATH defines the path to use for local workloads. This is a
+# format string that accepts a single string parameter.
+LOCAL_WORKLOADS_PATH = os.path.join(
+ os.path.dirname(__file__), "../workloads/{}/tar.tar")
+
+# REMOTE_WORKLOADS_PATH defines the path to use for storing the workloads on the
+# remote host. This is a format string that accepts a single string parameter.
+REMOTE_WORKLOADS_PATH = "workloads/{}"
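For illustration, the two format strings above are filled in with a workload name; `"ab"` is used here only as an example.

```python
from benchmarks import harness

# Each call substitutes the workload name into the path template.
local_tar = harness.LOCAL_WORKLOADS_PATH.format("ab")    # .../workloads/ab/tar.tar
remote_dir = harness.REMOTE_WORKLOADS_PATH.format("ab")  # workloads/ab
```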
diff --git a/benchmarks/harness/benchmark_driver.py b/benchmarks/harness/benchmark_driver.py
new file mode 100644
index 000000000..9abc21b54
--- /dev/null
+++ b/benchmarks/harness/benchmark_driver.py
@@ -0,0 +1,85 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Main driver for benchmarks."""
+
+import copy
+import statistics
+import threading
+import types
+
+from benchmarks import suites
+from benchmarks.harness.machine_producers import machine_producer
+
+
+# pylint: disable=too-many-instance-attributes
+class BenchmarkDriver:
+ """Allocates machines and invokes a benchmark method."""
+
+ def __init__(self,
+ producer: machine_producer.MachineProducer,
+ method: types.FunctionType,
+ runs: int = 1,
+ **kwargs):
+
+ self._producer = producer
+ self._method = method
+ self._kwargs = copy.deepcopy(kwargs)
+ self._threads = []
+ self.lock = threading.RLock()
+ self._runs = runs
+ self._metric_results = {}
+
+ def start(self):
+ """Starts a benchmark thread."""
+ for _ in range(self._runs):
+ thread = threading.Thread(target=self._run_method)
+ thread.start()
+ self._threads.append(thread)
+
+ def join(self):
+ """Joins the thread."""
+ # pylint: disable=expression-not-assigned
+ [t.join() for t in self._threads]
+
+ def _run_method(self):
+ """Runs all benchmarks."""
+ machines = self._producer.get_machines(
+ suites.benchmark_machines(self._method))
+ try:
+ result = self._method(*machines, **self._kwargs)
+ for name, res in result:
+ with self.lock:
+ if name in self._metric_results:
+ self._metric_results[name].append(res)
+ else:
+ self._metric_results[name] = [res]
+ finally:
+ # Always release.
+ self._producer.release_machines(machines)
+
+ def median(self):
+ """Returns the median result, after join is finished."""
+ for key, value in self._metric_results.items():
+ yield key, [statistics.median(value)]
+
+ def all(self):
+ """Returns all results."""
+ for key, value in self._metric_results.items():
+ yield key, value
+
+ def meanstd(self):
+    """Returns the mean and standard deviation of each metric."""
+ for key, value in self._metric_results.items():
+ mean = statistics.mean(value)
+ yield key, [mean, statistics.stdev(value, xbar=mean)]
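A hedged usage sketch of `BenchmarkDriver` follows. Only the driver API above is taken as given; the mock producer constructor and the `startup.empty` benchmark method are assumptions based on the other files in this change.

```python
from benchmarks.harness import benchmark_driver
from benchmarks.harness.machine_producers import mock_producer
from benchmarks.suites import startup

# Assumed constructor and benchmark method (see note above).
driver = benchmark_driver.BenchmarkDriver(
    producer=mock_producer.MockMachineProducer(),
    method=startup.empty,
    runs=3)
driver.start()  # Spawns one thread per run.
driver.join()   # Waits for every run to finish.
for metric, values in driver.median():
  print(metric, values)
```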
diff --git a/benchmarks/harness/container.py b/benchmarks/harness/container.py
new file mode 100644
index 000000000..585436e20
--- /dev/null
+++ b/benchmarks/harness/container.py
@@ -0,0 +1,181 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Container definitions."""
+
+import contextlib
+import logging
+import pydoc
+import types
+from typing import Tuple
+
+import docker
+import docker.errors
+
+from benchmarks import workloads
+
+
+class Container:
+ """Abstract container.
+
+ Must be a context manager.
+
+ Usage:
+
+ with Container(client, image, ...):
+ ...
+ """
+
+ def run(self, **env) -> str:
+ """Run the container synchronously."""
+ raise NotImplementedError
+
+ def detach(self, **env):
+ """Run the container asynchronously."""
+ raise NotImplementedError
+
+ def address(self) -> Tuple[str, int]:
+ """Return the bound address for the container."""
+ raise NotImplementedError
+
+ def get_names(self) -> types.GeneratorType:
+ """Return names of all containers."""
+ raise NotImplementedError
+
+
+# pylint: disable=too-many-instance-attributes
+class DockerContainer(Container):
+ """Class that handles creating a docker container."""
+
+ # pylint: disable=too-many-arguments
+ def __init__(self,
+ client: docker.DockerClient,
+ host: str,
+ image: str,
+ count: int = 1,
+ runtime: str = "runc",
+ port: int = 0,
+ **kwargs):
+    """Tries to set up "count" containers.
+
+ Args:
+ client: A docker client from dockerpy.
+ host: The host address the image is running on.
+ image: The name of the image to run.
+ count: The number of containers to setup.
+ runtime: The container runtime to use.
+ port: The port to reserve.
+ **kwargs: Additional container options.
+ """
+ assert count >= 1
+ assert port == 0 or count == 1
+ self._client = client
+ self._host = host
+ self._containers = []
+ self._count = count
+ self._image = image
+ self._runtime = runtime
+ self._port = port
+ self._kwargs = kwargs
+ if port != 0:
+ self._ports = {"%d/tcp" % port: None}
+ else:
+ self._ports = {}
+
+ @contextlib.contextmanager
+ def detach(self, **env):
+ env = ["%s=%s" % (key, value) for (key, value) in env.items()]
+ # Start all containers.
+ for _ in range(self._count):
+ try:
+ # Start the container in a detached mode.
+ container = self._client.containers.run(
+ self._image,
+ detach=True,
+ remove=True,
+ runtime=self._runtime,
+ ports=self._ports,
+ environment=env,
+ **self._kwargs)
+ logging.info("Started detached container %s -> %s", self._image,
+ container.attrs["Id"])
+ self._containers.append(container)
+ except Exception as exc:
+ self._clean_containers()
+ raise exc
+ try:
+ # Wait for all containers to be up.
+ for container in self._containers:
+ while not container.attrs["State"]["Running"]:
+ container = self._client.containers.get(container.attrs["Id"])
+ yield self
+ finally:
+ self._clean_containers()
+
+ def address(self) -> Tuple[str, int]:
+ assert self._count == 1
+ assert self._port != 0
+ container = self._client.containers.get(self._containers[0].attrs["Id"])
+ port = container.attrs["NetworkSettings"]["Ports"][
+ "%d/tcp" % self._port][0]["HostPort"]
+ return (self._host, port)
+
+ def get_names(self) -> types.GeneratorType:
+ for container in self._containers:
+ yield container.name
+
+ def run(self, **env) -> str:
+ env = ["%s=%s" % (key, value) for (key, value) in env.items()]
+ return self._client.containers.run(
+ self._image,
+ runtime=self._runtime,
+ ports=self._ports,
+ remove=True,
+ environment=env,
+ **self._kwargs).decode("utf-8")
+
+ def _clean_containers(self):
+ """Kills all containers."""
+ for container in self._containers:
+ try:
+ container.kill()
+ except docker.errors.NotFound:
+ pass
+
+
+class MockContainer(Container):
+ """Mock of Container."""
+
+ def __init__(self, workload: str):
+ self._workload = workload
+
+ def __enter__(self):
+ return self
+
+ def run(self, **env):
+    # Look up sample data, if any exists for the workload module. We locate
+    # the module with pydoc and call its well-defined sample function.
+ mod = pydoc.locate(workloads.__name__ + "." + self._workload)
+ if hasattr(mod, "sample"):
+ return mod.sample(**env)
+ return "" # No output.
+
+ def address(self) -> Tuple[str, int]:
+ return ("example.com", 80)
+
+ def get_names(self) -> types.GeneratorType:
+ yield "mock"
+
+ @contextlib.contextmanager
+ def detach(self, **env):
+ yield self
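For reference, a minimal sketch of how the DockerContainer class above might be driven directly; the docker client, host address, image tags, and port below are placeholders rather than values taken from this change:

    import docker
    from benchmarks.harness import container

    client = docker.from_env()
    # Reserve one container that publishes port 80; address() resolves the host port.
    web = container.DockerContainer(client, "10.0.0.1", "my-httpd-tag", port=80)
    with web.detach() as running:
        host, port = running.address()
        print("workload reachable at %s:%s" % (host, port))
    # run() is the synchronous variant and returns the container's stdout.
    output = container.DockerContainer(client, "10.0.0.1", "my-absl-tag").run()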
diff --git a/benchmarks/harness/machine.py b/benchmarks/harness/machine.py
new file mode 100644
index 000000000..af037dbcc
--- /dev/null
+++ b/benchmarks/harness/machine.py
@@ -0,0 +1,226 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Machine abstraction passed to benchmarks to run docker containers.
+
+Abstraction for interacting with test machines. Machines are produced
+by Machine producers and represent a local or remote machine. Benchmark
+methods in /benchmarks/suites are passed the required number of machines in order
+to run the benchmark. Machines contain methods to run commands via bash,
+possibly over ssh. Machines also hold a connection to the docker UNIX socket
+to run containers.
+
+ Typical usage example:
+
+ machine = Machine()
+ machine.run(cmd)
+ machine.pull(path)
+ container = machine.container()
+"""
+
+import logging
+import re
+import subprocess
+import time
+from typing import Tuple
+
+import docker
+
+from benchmarks import harness
+from benchmarks.harness import container
+from benchmarks.harness import machine_mocks
+from benchmarks.harness import ssh_connection
+from benchmarks.harness import tunnel_dispatcher
+
+
+class Machine(object):
+ """The machine object is the primary object for benchmarks.
+
+ Machine objects are passed to each metric function call and benchmarks use
+ machines to access real connections to those machines.
+
+ Attributes:
+ _name: Name as a string
+ """
+ _name = ""
+
+ def run(self, cmd: str) -> Tuple[str, str]:
+ """Convenience method for running a bash command on a machine object.
+
+ Some machines may point to the local machine and thus do not have ssh
+ connections. Run executes a command either locally or over ssh and returns
+ the stdout and stderr output as strings.
+
+ Args:
+ cmd: The command to run as a string.
+
+ Returns:
+ The command output.
+ """
+ raise NotImplementedError
+
+ def read(self, path: str) -> str:
+ """Reads the contents of some file.
+
+ This will be mocked.
+
+ Args:
+ path: The path to the file to be read.
+
+ Returns:
+ The file contents.
+ """
+ raise NotImplementedError
+
+ def pull(self, workload: str) -> str:
+ """Send the given workload to the machine, build and tag it.
+
+ All images must be defined by the workloads directory.
+
+ Args:
+ workload: The workload name.
+
+ Returns:
+ The workload tag.
+ """
+ raise NotImplementedError
+
+ def container(self, image: str, **kwargs) -> container.Container:
+ """Returns a container object.
+
+ Args:
+ image: The pulled image tag.
+ **kwargs: Additional container options.
+
+ Returns:
+ A container.Container object.
+ """
+ raise NotImplementedError
+
+ def sleep(self, amount: float):
+ """Sleeps the given amount of time."""
+ time.sleep(amount)
+
+ def __str__(self):
+ return self._name
+
+
+class MockMachine(Machine):
+ """A mocked machine."""
+ _name = "mock"
+
+ def run(self, cmd: str) -> Tuple[str, str]:
+ return "", ""
+
+ def read(self, path: str) -> str:
+ return machine_mocks.Readfile(path)
+
+ def pull(self, workload: str) -> str:
+ return workload # Workload is the tag.
+
+ def container(self, image: str, **kwargs) -> container.Container:
+ return container.MockContainer(image)
+
+ def sleep(self, amount: float):
+ pass
+
+
+def get_address(machine: Machine) -> str:
+ """Return a machine's default address."""
+ default_route, _ = machine.run("ip route get 8.8.8.8")
+ return re.search(" src ([0-9.]+) ", default_route).group(1)
+
+
+class LocalMachine(Machine):
+ """The local machine.
+
+ Attributes:
+ _name: Name as a string
+ _docker_client: a pythonic connection to the local dockerd unix socket.
+ See: https://github.com/docker/docker-py
+ """
+
+ def __init__(self, name):
+ self._name = name
+ self._docker_client = docker.from_env()
+
+ def run(self, cmd: str) -> Tuple[str, str]:
+ process = subprocess.Popen(
+ cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout, stderr = process.communicate()
+ return stdout.decode("utf-8"), stderr.decode("utf-8")
+
+ def read(self, path: str) -> str:
+ # Read the exact path locally.
+ with open(path, "r") as f:
+ return f.read()
+
+ def pull(self, workload: str) -> str:
+ # Run the docker build command locally.
+ logging.info("Building %s@%s locally...", workload, self._name)
+ with open(harness.LOCAL_WORKLOADS_PATH.format(workload),
+ "rb") as dockerfile:
+ self._docker_client.images.build(
+ fileobj=dockerfile, tag=workload, custom_context=True)
+ return workload # Workload is the tag.
+
+ def container(self, image: str, **kwargs) -> container.Container:
+ # Return a local docker container directly.
+ return container.DockerContainer(self._docker_client, get_address(self),
+ image, **kwargs)
+
+ def sleep(self, amount: float):
+ time.sleep(amount)
+
+
+class RemoteMachine(Machine):
+ """Remote machine accessible via an SSH connection.
+
+ Attributes:
+ _name: Name as a string
+ _ssh_connection: a paramiko backed ssh connection which can be used to run
+ commands on this machine
+ _tunnel: a python wrapper around a port forwarded ssh connection between a
+ local unix socket and the remote machine's dockerd unix socket.
+ _docker_client: a pythonic wrapper backed by the _tunnel. Allows sending
+ docker commands: see https://github.com/docker/docker-py
+ """
+
+ def __init__(self, name, **kwargs):
+ self._name = name
+ self._ssh_connection = ssh_connection.SSHConnection(name, **kwargs)
+ self._tunnel = tunnel_dispatcher.Tunnel(name, **kwargs)
+ self._tunnel.connect()
+ self._docker_client = self._tunnel.get_docker_client()
+
+ def run(self, cmd: str) -> Tuple[str, str]:
+ return self._ssh_connection.run(cmd)
+
+ def read(self, path: str) -> str:
+ # Just cat remotely.
+ stdout, stderr = self._ssh_connection.run("cat '{}'".format(path))
+ return stdout + stderr
+
+ def pull(self, workload: str) -> str:
+ # Push to the remote machine and build.
+ logging.info("Building %s@%s remotely...", workload, self._name)
+ remote_path = self._ssh_connection.send_workload(workload)
+ self.run("docker build --tag={} {}".format(workload, remote_path))
+ return workload # Workload is the tag.
+
+ def container(self, image: str, **kwargs) -> container.Container:
+ # Return a remote docker container.
+ return container.DockerContainer(self._docker_client, get_address(self),
+ image, **kwargs)
+
+ def sleep(self, amount: float):
+ time.sleep(amount)
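As a rough usage sketch of the Machine hierarchy above; the "absl" workload name and the remote connection details are assumptions for illustration, not part of this change:

    from benchmarks.harness import machine

    local = machine.LocalMachine("local")
    stdout, stderr = local.run("uname -a")    # plain shell command
    image = local.pull("absl")                # builds and tags the workload image
    output = local.container(image).run()     # synchronous run, returns stdout
    # RemoteMachine exposes the same interface but goes through ssh and the
    # docker tunnel, e.g.:
    # remote = machine.RemoteMachine("remote1", hostname="10.0.0.2",
    #                                key_path="/home/user/.ssh/benchmark-tools",
    #                                username="user")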
diff --git a/benchmarks/harness/machine_mocks/BUILD b/benchmarks/harness/machine_mocks/BUILD
new file mode 100644
index 000000000..c8ec4bc79
--- /dev/null
+++ b/benchmarks/harness/machine_mocks/BUILD
@@ -0,0 +1,9 @@
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "machine_mocks",
+ srcs = ["__init__.py"],
+)
diff --git a/benchmarks/harness/machine_mocks/__init__.py b/benchmarks/harness/machine_mocks/__init__.py
new file mode 100644
index 000000000..00f0085d7
--- /dev/null
+++ b/benchmarks/harness/machine_mocks/__init__.py
@@ -0,0 +1,81 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Machine mock files."""
+
+MEMINFO = """\
+MemTotal: 7652344 kB
+MemFree: 7174724 kB
+MemAvailable: 7152008 kB
+Buffers: 7544 kB
+Cached: 178856 kB
+SwapCached: 0 kB
+Active: 270928 kB
+Inactive: 68436 kB
+Active(anon): 153124 kB
+Inactive(anon): 880 kB
+Active(file): 117804 kB
+Inactive(file): 67556 kB
+Unevictable: 0 kB
+Mlocked: 0 kB
+SwapTotal: 0 kB
+SwapFree: 0 kB
+Dirty: 900 kB
+Writeback: 0 kB
+AnonPages: 153000 kB
+Mapped: 129120 kB
+Shmem: 1044 kB
+Slab: 60864 kB
+SReclaimable: 22792 kB
+SUnreclaim: 38072 kB
+KernelStack: 2672 kB
+PageTables: 5756 kB
+NFS_Unstable: 0 kB
+Bounce: 0 kB
+WritebackTmp: 0 kB
+CommitLimit: 3826172 kB
+Committed_AS: 663836 kB
+VmallocTotal: 34359738367 kB
+VmallocUsed: 0 kB
+VmallocChunk: 0 kB
+HardwareCorrupted: 0 kB
+AnonHugePages: 0 kB
+ShmemHugePages: 0 kB
+ShmemPmdMapped: 0 kB
+CmaTotal: 0 kB
+CmaFree: 0 kB
+HugePages_Total: 0
+HugePages_Free: 0
+HugePages_Rsvd: 0
+HugePages_Surp: 0
+Hugepagesize: 2048 kB
+DirectMap4k: 94196 kB
+DirectMap2M: 4624384 kB
+DirectMap1G: 3145728 kB
+"""
+
+CONTENTS = {
+ "/proc/meminfo": MEMINFO,
+}
+
+
+def Readfile(path: str) -> str:
+ """Reads a mock file.
+
+ Args:
+ path: The target path.
+
+ Returns:
+ Mocked file contents or None.
+ """
+ return CONTENTS.get(path, None)
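A small sketch of how the mock above is consumed, for example by a benchmark that reads /proc/meminfo through MockMachine.read():

    from benchmarks.harness import machine_mocks

    meminfo = machine_mocks.Readfile("/proc/meminfo")
    # Pull MemTotal (in kB) out of the canned contents.
    total_kb = int(next(line.split()[1]
                        for line in meminfo.splitlines()
                        if line.startswith("MemTotal")))
    assert total_kb == 7652344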
diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD
new file mode 100644
index 000000000..c4e943882
--- /dev/null
+++ b/benchmarks/harness/machine_producers/BUILD
@@ -0,0 +1,80 @@
+load("//benchmarks:defs.bzl", "py_library", "requirement")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "harness",
+ srcs = ["__init__.py"],
+)
+
+py_library(
+ name = "machine_producer",
+ srcs = ["machine_producer.py"],
+)
+
+py_library(
+ name = "mock_producer",
+ srcs = ["mock_producer.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/harness/machine_producers:gcloud_producer",
+ "//benchmarks/harness/machine_producers:machine_producer",
+ ],
+)
+
+py_library(
+ name = "yaml_producer",
+ srcs = ["yaml_producer.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/harness/machine_producers:machine_producer",
+ requirement("PyYAML", False),
+ ],
+)
+
+py_library(
+ name = "gcloud_mock_recorder",
+ srcs = ["gcloud_mock_recorder.py"],
+)
+
+py_library(
+ name = "gcloud_producer",
+ srcs = ["gcloud_producer.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/harness/machine_producers:gcloud_mock_recorder",
+ "//benchmarks/harness/machine_producers:machine_producer",
+ ],
+)
+
+filegroup(
+ name = "test_data",
+ srcs = [
+ "testdata/get_five.json",
+ "testdata/get_one.json",
+ ],
+)
+
+py_library(
+ name = "gcloud_producer_test_lib",
+ srcs = ["gcloud_producer_test.py"],
+ deps = [
+ "//benchmarks/harness/machine_producers:machine_producer",
+ "//benchmarks/harness/machine_producers:mock_producer",
+ ],
+)
+
+py_test(
+ name = "gcloud_producer_test",
+ srcs = [":gcloud_producer_test_lib"],
+ data = [
+ ":test_data",
+ ],
+ python_version = "PY3",
+ tags = [
+ "local",
+ ],
+)
diff --git a/benchmarks/harness/machine_producers/__init__.py b/benchmarks/harness/machine_producers/__init__.py
new file mode 100644
index 000000000..634ef4843
--- /dev/null
+++ b/benchmarks/harness/machine_producers/__init__.py
@@ -0,0 +1,13 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/benchmarks/harness/machine_producers/gcloud_mock_recorder.py b/benchmarks/harness/machine_producers/gcloud_mock_recorder.py
new file mode 100644
index 000000000..fd9837a37
--- /dev/null
+++ b/benchmarks/harness/machine_producers/gcloud_mock_recorder.py
@@ -0,0 +1,97 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A recorder and replay for testing the GCloudProducer.
+
+MockPrinter and MockReader handle printing and reading mock data for the
+purposes of testing. MockPrinter is passed to GCloudProducer objects. The user
+can then run scenarios and record them for playback in tests later.
+
+MockReader is passed to MockGCloudProducer objects and handles reading the
+previously recorded mock data.
+
+It is left to the user to check if data printed is properly redacted for their
+own use. The intended use case for this class is data coming from gcloud
+commands, which will contain public IPs and other instance data.
+
+The data format is json, printed to and read from the ./testdata directory. The
+data captures subprocess.CompletedProcess results (args, stdout, returncode) as json.
+
+ Typical usage example:
+
+ recorder = MockPrinter()
+ producer = GCloudProducer(args, recorder)
+ machines = producer.get_machines(1)
+ with open("my_file.json") as fd:
+ recorder.write_out(fd)
+
+ reader = MockReader(filename)
+ producer = MockGCloudProducer(reader)
+ machines = producer.get_machines(1)
+ assert len(machines) == 1
+"""
+
+import io
+import json
+import subprocess
+
+
+class MockPrinter(object):
+ """Handles printing Mock data for MockGcloudProducer.
+
+ Attributes:
+ _records: list of json object records for printing
+ """
+
+ def __init__(self):
+ self._records = []
+
+ def record(self, entry: subprocess.CompletedProcess):
+ """Records data and strips out ip addresses."""
+
+ record = {
+ "args": entry.args,
+ "stdout": entry.stdout.decode("utf-8"),
+ "returncode": str(entry.returncode)
+ }
+ self._records.append(record)
+
+ def write_out(self, fd: io.FileIO):
+ """Prints out the data into the given filepath."""
+ fd.write(json.dumps(self._records, indent=4))
+
+
+class MockReader(object):
+ """Handles reading Mock data for MockGcloudProducer.
+
+ Attributes:
+ _records: List[json] records read from the passed in file.
+ """
+
+ def __init__(self, filepath: str):
+ with open(filepath, "rb") as file:
+ self._records = json.loads(file.read())
+ self._i = 0
+
+ def pop(self, args) -> subprocess.CompletedProcess:
+ """Returns the next recorded call as a CompletedProcess."""
+ if self._i < len(self._records):
+ record = self._records[self._i]
+ self._i += 1
+ stdout = record["stdout"].encode("ascii")
+ returncode = int(record["returncode"])
+ return subprocess.CompletedProcess(
+ args=args, returncode=returncode, stdout=stdout, stderr=b"")
+ raise IndexError("No more recorded gcloud calls to replay.")
diff --git a/benchmarks/harness/machine_producers/gcloud_producer.py b/benchmarks/harness/machine_producers/gcloud_producer.py
new file mode 100644
index 000000000..4693dd8a2
--- /dev/null
+++ b/benchmarks/harness/machine_producers/gcloud_producer.py
@@ -0,0 +1,250 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A machine producer which produces machine objects using `gcloud`.
+
+Machine producers produce valid harness.Machine objects which are backed by
+real machines. This producer produces those machines on the given user's GCP
+account using the `gcloud` tool.
+
+GCloudProducer creates instances on the given GCP account named like:
+`machine-XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX` (a random uuid4 suffix) such that name
+collisions with user instances shouldn't happen.
+
+ Typical usage example:
+
+ producer = GCloudProducer(args)
+ machines = producer.get_machines(NUM_MACHINES)
+ # run stuff on machines with machines[i].run(CMD)
+ producer.release_machines(NUM_MACHINES)
+"""
+import datetime
+import getpass
+import json
+import subprocess
+import threading
+from typing import List, Dict, Any
+import uuid
+
+from benchmarks.harness import machine
+from benchmarks.harness.machine_producers import gcloud_mock_recorder
+from benchmarks.harness.machine_producers import machine_producer
+
+DEFAULT_USER = getpass.getuser()
+
+
+class GCloudProducer(machine_producer.MachineProducer):
+ """Implementation of MachineProducer backed by GCP.
+
+ Produces Machine objects backed by GCP instances.
+
+ Attributes:
+ project: The GCP project name under which to create the machines.
+ ssh_key_path: path to a valid ssh key. See README on valid ssh keys.
+ image: image name as a string.
+ image_project: image project as a string.
+ zone: string to a valid GCP zone.
+ ssh_user: string of the user name to use for ssh; defaults to the current
+ user when not set.
+ mock: a mock printer which will print mock data if required. Mock data is
+ recorded output from subprocess calls (returncode, stdout, args).
+ condition: mutex for this class around machine creation and deletion.
+ """
+
+ def __init__(self,
+ project: str,
+ ssh_key_path: str,
+ image: str,
+ image_project: str,
+ zone: str,
+ ssh_user: str,
+ mock: gcloud_mock_recorder.MockPrinter = None):
+ self.project = project
+ self.ssh_key_path = ssh_key_path
+ self.image = image
+ self.image_project = image_project
+ self.zone = zone
+ self.ssh_user = ssh_user if ssh_user else DEFAULT_USER
+ self.mock = mock
+ self.condition = threading.Condition()
+
+ def get_machines(self, num_machines: int) -> List[machine.Machine]:
+ """Returns requested number of machines backed by GCP instances."""
+ if num_machines <= 0:
+ raise ValueError(
+ "Cannot ask for {num} machines!".format(num=num_machines))
+ with self.condition:
+ names = self._get_unique_names(num_machines)
+ self._build_instances(names)
+ instances = self._start_command(names)
+ self._add_ssh_key_to_instances(names)
+ return self._machines_from_instances(instances)
+
+ def release_machines(self, machine_list: List[machine.Machine]):
+ """Releases the requested number of machines, deleting the instances."""
+ if not machine_list:
+ return
+ with self.condition:
+ cmd = "gcloud compute instances delete --quiet".split(" ")
+ names = [str(m) for m in machine_list]
+ cmd.extend(names)
+ cmd.append("--zone={zone}".format(zone=self.zone))
+ self._run_command(cmd)
+
+ def _machines_from_instances(
+ self, instances: List[Dict[str, Any]]) -> List[machine.Machine]:
+ """Creates Machine Objects from json data describing created instances."""
+ machines = []
+ for instance in instances:
+ name = instance["name"]
+ kwargs = {
+ "hostname":
+ instance["networkInterfaces"][0]["accessConfigs"][0]["natIP"],
+ "key_path":
+ self.ssh_key_path,
+ "username":
+ self.ssh_user
+ }
+ machines.append(machine.RemoteMachine(name=name, **kwargs))
+ return machines
+
+ def _get_unique_names(self, num_names) -> List[str]:
+ """Returns num_names unique names based on data from the GCP project."""
+ curr_machines = self._list_machines()
+ curr_names = set([machine["name"] for machine in curr_machines])
+ ret = []
+ while len(ret) < num_names:
+ new_name = "machine-" + str(uuid.uuid4())
+ if new_name not in curr_names:
+ ret.append(new_name)
+ curr_names.add(new_name)
+ return ret
+
+ def _build_instances(self, names: List[str]) -> List[Dict[str, Any]]:
+ """Creates instances using gcloud command.
+
+ Runs the command `gcloud compute instances create` and returns json data
+ on created instances on success. Creates len(names) instances, one for each
+ name.
+
+ Args:
+ names: list of names of instances to create.
+
+ Returns:
+ List of json data describing created machines.
+ """
+ if not names:
+ raise ValueError(
+ "_build_instances cannot create instances without names.")
+ cmd = "gcloud compute instances create".split(" ")
+ cmd.extend(names)
+ cmd.extend("--preemptible --image={image} --zone={zone}".format(
+ image=self.image, zone=self.zone).split(" "))
+ if self.image_project:
+ cmd.append("--image-project={project}".format(project=self.image_project))
+ res = self._run_command(cmd)
+ return json.loads(res.stdout)
+
+ def _start_command(self, names):
+ """Starts instances using gcloud command.
+
+ Runs the command `gcloud compute instances start` on list of instances by
+ name and returns json data on started instances on success.
+
+ Args:
+ names: list of names of instances to start.
+
+ Returns:
+ List of json data describing started machines.
+ """
+ if not names:
+ raise ValueError("_start_command cannot start empty instance list.")
+ cmd = "gcloud compute instances start".split(" ")
+ cmd.extend(names)
+ cmd.append("--zone={zone}".format(zone=self.zone))
+ cmd.append("--project={project}".format(project=self.project))
+ res = self._run_command(cmd)
+ return json.loads(res.stdout)
+
+ def _add_ssh_key_to_instances(self, names: List[str]) -> None:
+ """Adds ssh key to instances by calling gcloud ssh command.
+
+ Runs the command `gcloud compute ssh instance_name` for each instance in
+ names, retrying until an ssh connection to the instance succeeds.
+
+ Args:
+ names: list of machine names to which to add the ssh-key
+ self.ssh_key_path.
+
+ Raises:
+ subprocess.CalledProcessError: when underlying subprocess call returns an
+ error other than 255 (Connection closed by remote host).
+ TimeoutError: when 3 unsuccessful tries to ssh into the host return 255.
+ """
+ for name in names:
+ cmd = "gcloud compute ssh {name}".format(name=name).split(" ")
+ cmd.append("--ssh-key-file={key}".format(key=self.ssh_key_path))
+ cmd.append("--zone={zone}".format(zone=self.zone))
+ cmd.append("--command=uname")
+ timeout = datetime.timedelta(seconds=5 * 60)
+ start = datetime.datetime.now()
+ while datetime.datetime.now() <= timeout + start:
+ try:
+ self._run_command(cmd)
+ break
+ except subprocess.CalledProcessError as e:
+ if datetime.datetime.now() > timeout + start:
+ raise TimeoutError(
+ "Could not SSH into instance after 5 min: {name}".format(
+ name=name))
+ # 255 is the returncode for ssh connection refused.
+ elif e.returncode == 255:
+ continue
+ else:
+ raise e
+
+ def _list_machines(self) -> List[Dict[str, Any]]:
+ """Runs `list` gcloud command and returns list of Machine data."""
+ cmd = "gcloud compute instances list --project {project}".format(
+ project=self.project).split(" ")
+ res = self._run_command(cmd)
+ return json.loads(res.stdout)
+
+ def _run_command(self, cmd: List[str]) -> subprocess.CompletedProcess:
+ """Runs command as a subprocess.
+
+ Runs command as subprocess and returns the result.
+ If this has a mock recorder, use the record method to record the subprocess
+ call.
+
+ Args:
+ cmd: command to be run as a list of strings.
+
+ Returns:
+ Completed process object to be parsed by caller.
+
+ Raises:
+ CalledProcessError: if subprocess.run returns an error.
+ """
+ cmd = cmd + ["--format=json"]
+ res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ if self.mock:
+ self.mock.record(res)
+ if res.returncode != 0:
+ raise subprocess.CalledProcessError(
+ cmd=res.args,
+ output=res.stdout,
+ stderr=res.stderr,
+ returncode=res.returncode)
+ return res
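A hedged sketch of constructing the producer above; the project name and key path are placeholders, while the image, image project, and zone values simply mirror the recorded test data that follows:

    from benchmarks.harness.machine_producers import gcloud_producer

    producer = gcloud_producer.GCloudProducer(
        project="my-gcp-project",                        # placeholder
        ssh_key_path="/home/user/.ssh/benchmark-tools",  # placeholder RSA key
        image="ubuntu-1910-eoan-v20191204",
        image_project="ubuntu-os-cloud",
        zone="us-west1-b",
        ssh_user=None)                                   # falls back to getpass.getuser()
    machines = producer.get_machines(2)
    try:
        for m in machines:
            print(m.run("uname -a"))
    finally:
        producer.release_machines(machines)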
diff --git a/benchmarks/harness/machine_producers/gcloud_producer_test.py b/benchmarks/harness/machine_producers/gcloud_producer_test.py
new file mode 100644
index 000000000..c8adb2bdc
--- /dev/null
+++ b/benchmarks/harness/machine_producers/gcloud_producer_test.py
@@ -0,0 +1,48 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests GCloudProducer using mock data.
+
+GCloudProducer produces machines using 'get_machines' and 'release_machines'
+methods. The tests check recorded data (jsonified subprocess.CompletedProcess
+objects) of the producer producing one and five machines.
+"""
+import os
+import types
+
+from benchmarks.harness.machine_producers import machine_producer
+from benchmarks.harness.machine_producers import mock_producer
+
+TEST_DIR = os.path.dirname(__file__)
+
+
+def run_get_release(producer: machine_producer.MachineProducer,
+ num_machines: int,
+ validator: types.FunctionType = None):
+ machines = producer.get_machines(num_machines)
+ assert len(machines) == num_machines
+ if validator:
+ validator(machines=machines, cmd="uname -a", workload=None)
+ producer.release_machines(machines)
+
+
+def test_run_one():
+ mock = mock_producer.MockReader(TEST_DIR + "get_one.json")
+ producer = mock_producer.MockGCloudProducer(mock)
+ run_get_release(producer, 1)
+
+
+def test_run_five():
+ mock = mock_producer.MockReader(TEST_DIR + "get_five.json")
+ producer = mock_producer.MockGCloudProducer(mock)
+ run_get_release(producer, 5)
diff --git a/benchmarks/harness/machine_producers/machine_producer.py b/benchmarks/harness/machine_producers/machine_producer.py
new file mode 100644
index 000000000..f5591c026
--- /dev/null
+++ b/benchmarks/harness/machine_producers/machine_producer.py
@@ -0,0 +1,51 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Abstract types."""
+
+import threading
+from typing import List
+
+from benchmarks.harness import machine
+
+
+class MachineProducer:
+ """Abstract Machine producer."""
+
+ def get_machines(self, num_machines: int) -> List[machine.Machine]:
+ """Returns the requested number of machines."""
+ raise NotImplementedError
+
+ def release_machines(self, machine_list: List[machine.Machine]):
+ """Releases the given set of machines."""
+ raise NotImplementedError
+
+
+class LocalMachineProducer(MachineProducer):
+ """Produces Local Machines."""
+
+ def __init__(self, limit: int):
+ self.limit_sem = threading.Semaphore(value=limit)
+
+ def get_machines(self, num_machines: int) -> List[machine.Machine]:
+ """Returns the request number of MockMachines."""
+
+ self.limit_sem.acquire()
+ return [machine.LocalMachine("local") for _ in range(num_machines)]
+
+ def release_machines(self, machine_list: List[machine.LocalMachine]):
+ """Releases the semaphore slot held by the given machines."""
+ if not machine_list:
+ raise ValueError("Cannot release an empty list!")
+ self.limit_sem.release()
+ machine_list.clear()
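A minimal sketch of the local producer above; get_machines() hands back LocalMachine objects and release_machines() frees the semaphore slot again:

    from benchmarks.harness.machine_producers import machine_producer

    producer = machine_producer.LocalMachineProducer(limit=1)
    machines = producer.get_machines(1)
    try:
        stdout, _ = machines[0].run("uname -a")
        print(stdout)
    finally:
        producer.release_machines(machines)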
diff --git a/benchmarks/harness/machine_producers/mock_producer.py b/benchmarks/harness/machine_producers/mock_producer.py
new file mode 100644
index 000000000..37e9cb4b7
--- /dev/null
+++ b/benchmarks/harness/machine_producers/mock_producer.py
@@ -0,0 +1,52 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Producers of mocks."""
+
+from typing import List, Any
+
+from benchmarks.harness import machine
+from benchmarks.harness.machine_producers import gcloud_mock_recorder
+from benchmarks.harness.machine_producers import gcloud_producer
+from benchmarks.harness.machine_producers import machine_producer
+
+
+class MockMachineProducer(machine_producer.MachineProducer):
+ """Produces MockMachine objects."""
+
+ def get_machines(self, num_machines: int) -> List[machine.MockMachine]:
+ """Returns the request number of MockMachines."""
+ return [machine.MockMachine() for i in range(num_machines)]
+
+ def release_machines(self, machine_list: List[machine.MockMachine]):
+ """No-op."""
+ return
+
+
+class MockGCloudProducer(gcloud_producer.GCloudProducer):
+ """Mocks GCloudProducer for testing purposes."""
+
+ def __init__(self, mock: gcloud_mock_recorder.MockReader, **kwargs):
+ gcloud_producer.GCloudProducer.__init__(
+ self, project="mock", ssh_key_path="mock", image="mock",
+ image_project="mock", zone="mock", ssh_user="mock", **kwargs)
+ self.mock = mock
+
+ def _validate_ssh_file(self):
+ pass
+
+ def _run_command(self, cmd):
+ return self.mock.pop(cmd)
+
+ def _machines_from_instances(
+ self, instances: List[Any]) -> List[machine.MockMachine]:
+ return [machine.MockMachine() for _ in instances]
diff --git a/benchmarks/harness/machine_producers/testdata/get_five.json b/benchmarks/harness/machine_producers/testdata/get_five.json
new file mode 100644
index 000000000..32bad1b06
--- /dev/null
+++ b/benchmarks/harness/machine_producers/testdata/get_five.json
@@ -0,0 +1,211 @@
+[
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "instances",
+ "list",
+ "--project",
+ "project",
+ "--format=json"
+ ],
+ "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":{\"natIP\":\"0.0.0.0\"}]}]}]",
+ "returncode": "0"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "instances",
+ "create",
+ "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+ "machine-5f28f145-cc2d-427d-9cbf-428d164cdb92",
+ "machine-da5859b5-bae6-435d-8005-0202d6f6e065",
+ "machine-880a8a2f-918c-4f9e-a43c-ed3c8e02ea05",
+ "machine-1149147d-71e2-43ea-8fe1-49256e5c441c",
+ "--preemptible",
+ "--image=ubuntu-1910-eoan-v20191204",
+ "--zone=us-west1-b",
+ "--image-project=ubuntu-os-cloud",
+ "--format=json"
+ ],
+ "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]",
+ "returncode": "0"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "instances",
+ "start",
+ "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+ "machine-5f28f145-cc2d-427d-9cbf-428d164cdb92",
+ "machine-da5859b5-bae6-435d-8005-0202d6f6e065",
+ "machine-880a8a2f-918c-4f9e-a43c-ed3c8e02ea05",
+ "machine-1149147d-71e2-43ea-8fe1-49256e5c441c",
+ "--zone=us-west1-b",
+ "--project=project",
+ "--format=json"
+ ],
+ "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]},{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]",
+ "returncode": "0"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "",
+ "returncode": "255"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "",
+ "returncode": "255"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "",
+ "returncode": "255"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "",
+ "returncode": "255"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "",
+ "returncode": "255"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "Linux\n[]\n",
+ "returncode": "0"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-5f28f145-cc2d-427d-9cbf-428d164cdb92",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "Linux\n[]\n",
+ "returncode": "0"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-da5859b5-bae6-435d-8005-0202d6f6e065",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "Linux\n[]\n",
+ "returncode": "0"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-880a8a2f-918c-4f9e-a43c-ed3c8e02ea05",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "Linux\n[]\n",
+ "returncode": "0"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-1149147d-71e2-43ea-8fe1-49256e5c441c",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "Linux\n[]\n",
+ "returncode": "0"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "instances",
+ "delete",
+ "--quiet",
+ "machine-42c9bf6e-8d45-4c37-b1c0-7e4fdcf530fc",
+ "machine-5f28f145-cc2d-427d-9cbf-428d164cdb92",
+ "machine-da5859b5-bae6-435d-8005-0202d6f6e065",
+ "machine-880a8a2f-918c-4f9e-a43c-ed3c8e02ea05",
+ "machine-1149147d-71e2-43ea-8fe1-49256e5c441c",
+ "--zone=us-west1-b",
+ "--format=json"
+ ],
+ "stdout": "[]\n",
+ "returncode": "0"
+ }
+]
diff --git a/benchmarks/harness/machine_producers/testdata/get_one.json b/benchmarks/harness/machine_producers/testdata/get_one.json
new file mode 100644
index 000000000..c359c19c8
--- /dev/null
+++ b/benchmarks/harness/machine_producers/testdata/get_one.json
@@ -0,0 +1,145 @@
+[
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "instances",
+ "list",
+ "--project",
+ "linux-testing-user",
+ "--format=json"
+ ],
+ "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]",
+
+ "returncode": "0"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "instances",
+ "create",
+ "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+ "--preemptible",
+ "--image=ubuntu-1910-eoan-v20191204",
+ "--zone=us-west1-b",
+ "--image-project=ubuntu-os-cloud",
+ "--format=json"
+ ],
+ "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]",
+ "returncode": "0"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "instances",
+ "start",
+ "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+ "--zone=us-west1-b",
+ "--project=linux-testing-user",
+ "--format=json"
+ ],
+ "stdout": "[{\"name\":\"name\", \"networkInterfaces\":[{\"accessConfigs\":[{\"natIP\":\"0.0.0.0\"}]}]}]",
+
+ "returncode": "0"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "",
+ "returncode": "255"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "",
+ "returncode": "255"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "",
+ "returncode": "255"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "",
+ "returncode": "255"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "",
+ "returncode": "255"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "ssh",
+ "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+ "--ssh-key-file=/usr/local/google/home/user/.ssh/benchmark-tools",
+ "--zone=us-west1-b",
+ "--command=uname",
+ "--format=json"
+ ],
+ "stdout": "Linux\n[]\n",
+ "returncode": "0"
+ },
+ {
+ "args": [
+ "gcloud",
+ "compute",
+ "instances",
+ "delete",
+ "--quiet",
+ "machine-129dfcf9-b05b-4c16-a4cd-21353b570ddc",
+ "--zone=us-west1-b",
+ "--format=json"
+ ],
+ "stdout": "[]\n",
+ "returncode": "0"
+ }
+]
diff --git a/benchmarks/harness/machine_producers/yaml_producer.py b/benchmarks/harness/machine_producers/yaml_producer.py
new file mode 100644
index 000000000..5d334e480
--- /dev/null
+++ b/benchmarks/harness/machine_producers/yaml_producer.py
@@ -0,0 +1,106 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Producers based on yaml files."""
+
+import os
+import threading
+from typing import Dict
+from typing import List
+
+import yaml
+
+from benchmarks.harness import machine
+from benchmarks.harness.machine_producers import machine_producer
+
+
+class YamlMachineProducer(machine_producer.MachineProducer):
+ """Loads machines from a yaml file."""
+
+ def __init__(self, path: str):
+ self.machines = build_machines(path)
+ self.max_machines = len(self.machines)
+ self.machine_condition = threading.Condition()
+
+ def get_machines(self, num_machines: int) -> List[machine.Machine]:
+ if num_machines > self.max_machines:
+ raise ValueError(
+ "Insufficient Ammount of Machines. {ask} asked for and have {max_num} max."
+ .format(ask=num_machines, max_num=self.max_machines))
+
+ with self.machine_condition:
+ while not self._enough_machines(num_machines):
+ self.machine_condition.wait(timeout=1)
+ return [self.machines.pop(0) for _ in range(num_machines)]
+
+ def release_machines(self, machine_list: List[machine.Machine]):
+ with self.machine_condition:
+ while machine_list:
+ next_machine = machine_list.pop()
+ self.machines.append(next_machine)
+ self.machine_condition.notify()
+
+ def _enough_machines(self, ask: int):
+ return ask <= len(self.machines)
+
+
+def build_machines(path: str, num_machines: int = -1) -> List[machine.Machine]:
+ """Builds machine objects defined by the yaml file "path".
+
+ Args:
+ path: The path to a yaml file which defines machines.
+ num_machines: Optional limit on how many machine objects to build.
+
+ Returns:
+ Machine objects in a list.
+
+ If num_machines is set, len(machines) <= num_machines.
+ """
+ data = parse_yaml(path)
+ machines = []
+ for key, value in data.items():
+ if len(machines) == num_machines:
+ return machines
+ if isinstance(value, dict):
+ machines.append(machine.RemoteMachine(key, **value))
+ else:
+ machines.append(machine.LocalMachine(key))
+ return machines
+
+
+def parse_yaml(path: str) -> Dict[str, Dict[str, str]]:
+ """Parse the yaml file pointed by path.
+
+ Args:
+ path: The path to yaml file.
+
+ Returns:
+ The contents of the yaml file as a dictionary.
+ """
+ data = get_file_contents(path)
+ return yaml.load(data, Loader=yaml.Loader)
+
+
+def get_file_contents(path: str) -> str:
+ """Dumps the file contents to a string and returns them.
+
+ Args:
+ path: The path to dump.
+
+ Returns:
+ The file contents as a string.
+ """
+ if not os.path.isabs(path):
+ path = os.path.abspath(path)
+ with open(path) as input_file:
+ return input_file.read()
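For illustration, the yaml file consumed by the producer above maps a machine name either to a scalar (yielding a LocalMachine) or to RemoteMachine keyword arguments; the file contents sketched in the comment are hypothetical:

    # machines.yaml (hypothetical):
    #   local: localhost
    #   remote1:
    #     hostname: 10.0.0.2
    #     username: user
    #     key_path: /home/user/.ssh/benchmark-tools
    from benchmarks.harness.machine_producers import yaml_producer

    producer = yaml_producer.YamlMachineProducer("machines.yaml")
    machines = producer.get_machines(1)
    try:
        print(machines[0].run("uname -a"))
    finally:
        producer.release_machines(machines)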
diff --git a/benchmarks/harness/ssh_connection.py b/benchmarks/harness/ssh_connection.py
new file mode 100644
index 000000000..fcbfbcdb2
--- /dev/null
+++ b/benchmarks/harness/ssh_connection.py
@@ -0,0 +1,111 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""SSHConnection handles the details of SSH connections."""
+
+import os
+import warnings
+from typing import Tuple
+
+import paramiko
+
+from benchmarks import harness
+
+# Get rid of paramiko Cryptography Warnings.
+warnings.filterwarnings(action="ignore", module=".*paramiko.*")
+
+
+def send_one_file(client: paramiko.SSHClient, path: str, remote_dir: str):
+ """Sends a single file via an SSH client.
+
+ Args:
+ client: The existing SSH client.
+ path: The local path.
+ remote_dir: The remote directory.
+ """
+ filename = path.split("/").pop()
+ client.exec_command("mkdir -p " + remote_dir)
+ with client.open_sftp() as ftp_client:
+ ftp_client.put(path, os.path.join(remote_dir, filename))
+
+
+class SSHConnection:
+ """SSH connection to a remote machine."""
+
+ def __init__(self, name: str, hostname: str, key_path: str, username: str,
+ **kwargs):
+ """Sets up a paramiko ssh connection to the given hostname."""
+ self._name = name # Unused.
+ self._hostname = hostname
+ self._username = username
+ self._key_path = key_path # RSA Key path
+ self._kwargs = kwargs
+ # SSHConnection wraps paramiko. paramiko supports RSA, ECDSA, and Ed25519
+ # keys, and we've chosen to only support and require RSA keys. paramiko
+ # supports RSA private keys whose PEM file begins with
+ # '-----BEGIN RSA PRIVATE KEY-----'.
+ # https://stackoverflow.com/questions/53600581/ssh-key-generated-by-ssh-keygen-is-not-recognized-by-paramiko
+ self.rsa_key = self._rsa()
+ self.run("true") # Validate.
+
+ def _client(self) -> paramiko.SSHClient:
+ """Returns a connected SSH client."""
+ client = paramiko.SSHClient()
+ client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ client.connect(
+ hostname=self._hostname,
+ port=22,
+ username=self._username,
+ pkey=self.rsa_key,
+ allow_agent=False,
+ look_for_keys=False)
+ return client
+
+ def _rsa(self):
+ if "key_password" in self._kwargs:
+ password = self._kwargs["key_password"]
+ else:
+ password = None
+ rsa = paramiko.RSAKey.from_private_key_file(self._key_path, password)
+ return rsa
+
+ def run(self, cmd: str) -> Tuple[str, str]:
+ """Runs a command via ssh.
+
+ Args:
+ cmd: The shell command to run.
+
+ Returns:
+ The contents of stdout and stderr.
+ """
+ with self._client() as client:
+ _, stdout, stderr = client.exec_command(command=cmd)
+ stdout.channel.recv_exit_status()
+ stdout = stdout.read().decode("utf-8")
+ stderr = stderr.read().decode("utf-8")
+ return stdout, stderr
+
+ def send_workload(self, name: str) -> str:
+ """Sends a workload to the remote machine.
+
+ Args:
+ name: The workload name.
+
+ Returns:
+ The remote path.
+ """
+ with self._client() as client:
+ for dirpath, _, filenames in os.walk(
+ harness.LOCAL_WORKLOADS_PATH.format(name)):
+ for filename in filenames:
+ send_one_file(client, os.path.join(dirpath, filename),
+ harness.REMOTE_WORKLOADS_PATH.format(name))
+ return harness.REMOTE_WORKLOADS_PATH.format(name)
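A short sketch of the connection class above; the hostname, username, and key path are placeholders, and the key must be an RSA private key as noted in the comments:

    from benchmarks.harness import ssh_connection

    conn = ssh_connection.SSHConnection(
        name="remote1",
        hostname="10.0.0.2",                         # placeholder address
        key_path="/home/user/.ssh/benchmark-tools",  # placeholder RSA key
        username="user")
    stdout, stderr = conn.run("docker --version")
    # Copy a workload directory to the remote host so it can be built there.
    remote_path = conn.send_workload("absl")         # assumed workload name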
diff --git a/benchmarks/harness/tunnel_dispatcher.py b/benchmarks/harness/tunnel_dispatcher.py
new file mode 100644
index 000000000..c56fd022a
--- /dev/null
+++ b/benchmarks/harness/tunnel_dispatcher.py
@@ -0,0 +1,122 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tunnel handles setting up connections to remote machines.
+
+Tunnel dispatcher is a wrapper around the connection from a local UNIX socket
+and a remote UNIX socket via SSH with port forwarding. This is done to
+initialize the pythonic dockerpy client to run containers on the remote host by
+connecting to /var/run/docker.sock (where Docker is listening). Tunnel
+dispatcher sets up the local UNIX socket and calls the `ssh` command as a
+subprocess, and holds a reference to that subprocess. It manages clean-up on
+exit as best it can by killing the ssh subprocess and deleting the local UNIX
+socket, which is stored in /tmp so it is easy to clean up by hand if this fails.
+
+ Typical usage example:
+
+ t = Tunnel(name, **kwargs)
+ t.connect()
+ client = t.get_docker_client()
+ client.containers.run("ubuntu", "echo hello world")
+
+"""
+
+import os
+import tempfile
+import time
+
+import docker
+import pexpect
+
+SSH_TUNNEL_COMMAND = """ssh
+ -o GlobalKnownHostsFile=/dev/null
+ -o UserKnownHostsFile=/dev/null
+ -o StrictHostKeyChecking=no
+ -o IdentitiesOnly=yes
+ -nNT -L {filename}:/var/run/docker.sock
+ -i {key_path}
+ {username}@{hostname}"""
+
+
+class Tunnel(object):
+ """The tunnel object represents the tunnel via ssh.
+
+ This connects a local unix domain socket with a remote socket.
+
+ Attributes:
+ _filename: a temporary name of the UNIX socket prefixed by the name
+ argument.
+ _hostname: the IP or resolvable hostname of the remote host.
+ _username: the username of the ssh_key used to run ssh.
+ _key_path: path to a valid key.
+ _key_password: optional password to the ssh key in _key_path
+ _process: holds reference to the ssh subprocess created.
+
+ """
+
+ def __init__(self,
+ name: str,
+ hostname: str,
+ username: str,
+ key_path: str,
+ key_password: str = "",
+ **kwargs):
+ self._filename = tempfile.NamedTemporaryFile(prefix=name).name
+ self._hostname = hostname
+ self._username = username
+ self._key_path = key_path
+ self._key_password = key_password
+ self._kwargs = kwargs
+ self._process = None
+
+ def connect(self):
+ """Connects the SSH tunnel and stores the subprocess reference in _process."""
+ cmd = SSH_TUNNEL_COMMAND.format(
+ filename=self._filename,
+ key_path=self._key_path,
+ username=self._username,
+ hostname=self._hostname)
+ self._process = pexpect.spawn(cmd, timeout=10)
+
+ # If given a password, assume we'll be asked for it.
+ if self._key_password:
+ self._process.expect(["Enter passphrase for key .*: "])
+ self._process.sendline(self._key_password)
+
+ while True:
+ # Wait for the tunnel to appear.
+ if self._process.exitstatus is not None:
+ raise ConnectionError("Error in setting up ssh tunnel")
+ if os.path.exists(self._filename):
+ return
+ time.sleep(0.1)
+
+ def path(self):
+ """Return the socket file."""
+ return self._filename
+
+ def get_docker_client(self):
+ """Returns a docker client for this Tunnel."""
+ return docker.DockerClient(base_url="unix:/" + self._filename)
+
+ def __del__(self):
+ """Closes the ssh connection process and deletes the socket file."""
+ if self._process:
+ self._process.close()
+ if os.path.exists(self._filename):
+ os.remove(self._filename)
diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt
new file mode 100644
index 000000000..577eb1a2e
--- /dev/null
+++ b/benchmarks/requirements.txt
@@ -0,0 +1,32 @@
+asn1crypto==1.2.0
+atomicwrites==1.3.0
+attrs==19.3.0
+bcrypt==3.1.7
+certifi==2019.9.11
+cffi==1.13.2
+chardet==3.0.4
+Click==7.0
+cryptography==2.8
+docker==3.7.0
+docker-pycreds==0.4.0
+idna==2.8
+importlib-metadata==0.23
+more-itertools==7.2.0
+packaging==19.2
+paramiko==2.6.0
+pathlib2==2.3.5
+pexpect==4.7.0
+pluggy==0.9.0
+ptyprocess==0.6.0
+py==1.8.0
+pycparser==2.19
+PyNaCl==1.3.0
+pyparsing==2.4.5
+pytest==4.3.0
+PyYAML==5.1.2
+requests==2.22.0
+six==1.13.0
+urllib3==1.25.7
+wcwidth==0.1.7
+websocket-client==0.56.0
+zipp==0.6.0
diff --git a/benchmarks/run.py b/benchmarks/run.py
new file mode 100644
index 000000000..a22eb8641
--- /dev/null
+++ b/benchmarks/run.py
@@ -0,0 +1,19 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Benchmark runner."""
+
+from benchmarks import runner
+
+if __name__ == "__main__":
+ runner.runner()
diff --git a/benchmarks/runner/BUILD b/benchmarks/runner/BUILD
new file mode 100644
index 000000000..e1b2ea550
--- /dev/null
+++ b/benchmarks/runner/BUILD
@@ -0,0 +1,63 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+
+package(licenses = ["notice"])
+
+py_library(
+ name = "runner",
+ srcs = ["__init__.py"],
+ data = [
+ "//benchmarks/workloads:files",
+ ],
+ visibility = ["//benchmarks:__pkg__"],
+ deps = [
+ ":commands",
+ "//benchmarks/harness:benchmark_driver",
+ "//benchmarks/harness/machine_producers:machine_producer",
+ "//benchmarks/harness/machine_producers:mock_producer",
+ "//benchmarks/harness/machine_producers:yaml_producer",
+ "//benchmarks/suites",
+ "//benchmarks/suites:absl",
+ "//benchmarks/suites:density",
+ "//benchmarks/suites:fio",
+ "//benchmarks/suites:helpers",
+ "//benchmarks/suites:http",
+ "//benchmarks/suites:media",
+ "//benchmarks/suites:ml",
+ "//benchmarks/suites:network",
+ "//benchmarks/suites:redis",
+ "//benchmarks/suites:startup",
+ "//benchmarks/suites:sysbench",
+ "//benchmarks/suites:syscall",
+ requirement("click", True),
+ ],
+)
+
+py_library(
+ name = "commands",
+ srcs = ["commands.py"],
+ deps = [
+ requirement("click", True),
+ ],
+)
+
+py_test(
+ name = "runner_test",
+ srcs = ["runner_test.py"],
+ python_version = "PY3",
+ tags = [
+ "local",
+ "manual",
+ ],
+ deps = [
+ ":runner",
+ requirement("click", True),
+ requirement("attrs", False),
+ requirement("atomicwrites", False),
+ requirement("more-itertools", False),
+ requirement("pathlib2", False),
+ requirement("pluggy", False),
+ requirement("py", False),
+ requirement("pytest", True),
+ requirement("six", False),
+ ],
+)
diff --git a/benchmarks/runner/__init__.py b/benchmarks/runner/__init__.py
new file mode 100644
index 000000000..6f56704d8
--- /dev/null
+++ b/benchmarks/runner/__init__.py
@@ -0,0 +1,278 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""High-level benchmark utility."""
+
+import copy
+import csv
+import logging
+import pkgutil
+import pydoc
+import re
+import sys
+import types
+from typing import List
+from typing import Tuple
+
+import click
+
+from benchmarks import suites
+from benchmarks.harness import benchmark_driver
+from benchmarks.harness.machine_producers import machine_producer
+from benchmarks.harness.machine_producers import mock_producer
+from benchmarks.harness.machine_producers import yaml_producer
+from benchmarks.runner import commands
+
+
+@click.group()
+@click.option(
+ "--verbose/--no-verbose", default=False, help="Enable verbose logging.")
+@click.option("--debug/--no-debug", default=False, help="Enable debug logging.")
+def runner(verbose: bool = False, debug: bool = False):
+ """Run distributed benchmarks.
+
+ See the run and list commands for details.
+
+ Args:
+ verbose: Enable verbose logging.
+ debug: Enable debug logging (supersedes verbose).
+ """
+ if debug:
+ logging.basicConfig(level=logging.DEBUG)
+ elif verbose:
+ logging.basicConfig(level=logging.INFO)
+
+
+def find_benchmarks(
+ regex: str) -> List[Tuple[str, types.ModuleType, types.FunctionType]]:
+ """Finds all available benchmarks.
+
+ Args:
+ regex: A regular expression to match.
+
+ Returns:
+ A (short_name, module, function) tuple for each match.
+ """
+ pkgs = pkgutil.walk_packages(suites.__path__, suites.__name__ + ".")
+ found = []
+ for _, name, _ in pkgs:
+ mod = pydoc.locate(name)
+ funcs = [
+ getattr(mod, x)
+ for x in dir(mod)
+ if suites.is_benchmark(getattr(mod, x))
+ ]
+ for func in funcs:
+ # Use the short_name with the benchmarks. prefix stripped.
+ prefix_len = len(suites.__name__ + ".")
+ short_name = mod.__name__[prefix_len:] + "." + func.__name__
+ # Add to the list if the short name matches the given pattern.
+ if re.compile(regex).match(short_name):
+ found.append((short_name, mod, func))
+ return found
+
+
+@runner.command("list")
+@click.argument("method", nargs=-1)
+def list_all(method):
+ """Lists available benchmarks."""
+ if not method:
+ method = ".*"
+ else:
+ method = "(" + ",".join(method) + ")"
+ for (short_name, _, func) in find_benchmarks(method):
+ print("Benchmark %s:" % short_name)
+ metrics = suites.benchmark_metrics(func)
+ if func.__doc__:
+ print(" " + func.__doc__.lstrip().rstrip())
+ if metrics:
+ print("\n Metrics:")
+ for metric in metrics:
+ print("\t{name}: {doc}".format(name=metric[0], doc=metric[1]))
+ print("\n")
+
+
+@runner.command("run-local", commands.LocalCommand)
+@click.pass_context
+def run_local(ctx, limit: float, **kwargs):
+ """Runs benchmarks locally."""
+ run(ctx, machine_producer.LocalMachineProducer(limit=limit), **kwargs)
+
+
+@runner.command("run-mock", commands.RunCommand)
+@click.pass_context
+def run_mock(ctx, **kwargs):
+ """Runs benchmarks on Mock machines. Used for testing."""
+ run(ctx, mock_producer.MockMachineProducer(), **kwargs)
+
+
+def run(ctx, producer: machine_producer.MachineProducer, method: str, runs: int,
+ runtime: List[str], metric: List[str], stat: str, **kwargs):
+ """Runs arbitrary benchmarks.
+
+  All unknown command line flags are passed through to the underlying benchmark
+  method. A flag may be specified multiple times, in which case it is treated
+  as a "dimension" of the test, and a comma-separated table will be emitted
+  instead of a single result.
+
+ See the output of list to see available metrics for any given benchmark
+ method. The method parameter is a regular expression that will match against
+ available benchmarks. If multiple benchmarks match, then that is considered a
+ distinct "dimension" for the test.
+
+ All benchmarks are run in parallel where possible, but have exclusive
+ ownership over the individual machines.
+
+ Every benchmark method will be run the times indicated by --runs.
+
+ Args:
+ ctx: Click context.
+ producer: A Machine Producer from which to get Machines.
+ method: A regular expression for methods to be run.
+ runs: Number of runs.
+ runtime: A list of runtimes to test.
+ metric: A list of metrics to extract.
+ stat: The class of statistics to extract.
+ **kwargs: Dimensions to test.
+ """
+ # First, calculate additional arguments.
+ #
+ # This essentially calculates any arguments that appear multiple times, and
+ # moves those to the "dimensions" dictionary, which maps to lists. These
+ # dimensions are then iterated over to generate the relevant csv output.
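+  #
+  # For example (hypothetical invocation), passing --runtime=runc
+  # --runtime=runsc makes runtime a dimension with two values, and the output
+  # will contain one row per (runtime, method, metric) combination.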
+ dimensions = {}
+
+ if stat not in ["median", "all", "meanstd"]:
+ raise ValueError("Illegal value for --result, see help.")
+
+ def squish(key: str, value: str):
+ """Collapse an argument into kwargs or dimensions."""
+ if key in dimensions:
+ # Extend an existing dimension.
+ dimensions[key].append(value)
+ elif key in kwargs:
+ # Create a new dimension.
+ dimensions[key] = [kwargs[key], value]
+ del kwargs[key]
+ else:
+ # A single value.
+ kwargs[key] = value
+
+ for item in ctx.args:
+ if "=" in method:
+ # This must be the method. The method is simply set to the first
+ # non-matching argument, which we're also parsing here.
+ item, method = method, item
+ if "=" not in item:
+ logging.error("illegal argument: %s", item)
+ sys.exit(1)
+ (key, value) = item.lstrip("-").split("=", 1)
+ squish(key, value)
+
+ # Convert runtime and metric to dimensions.
+ #
+ # They exist only in the arguments above for documentation purposes.
+  # Essentially here we are treating them like anything else. Note, however,
+ # that an empty set here will result in a dimension. This is important for
+ # metrics, where an empty set actually means all metrics.
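+  #
+  # For example (illustrative), runtime=["runc"] flattens to
+  # kwargs["runtime"] = "runc", while runtime=["runc", "runsc"] becomes a
+  # dimension with both values.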
+ def fold(key: str, value, allow_flatten=False):
+ """Collapse a list value into kwargs or dimensions."""
+ if len(value) == 1 and allow_flatten:
+ kwargs[key] = value[0]
+ else:
+ dimensions[key] = value
+
+ fold("runtime", runtime, allow_flatten=True)
+ fold("metric", metric)
+
+ # Lookup the methods.
+ #
+ # We match the method parameter to a regular expression. This allows you to
+ # do things like `run --mock .*` for a broad test. Note that we track the
+ # short_names in the dimensions here, and look up again in the recursion.
+ methods = {
+ short_name: func for (short_name, _, func) in find_benchmarks(method)
+ }
+ if not methods:
+ # Must match at least one method.
+ logging.error("no matching benchmarks for %s: try list.", method)
+ sys.exit(1)
+ fold("method", list(methods.keys()), allow_flatten=True)
+
+ # Spin up the drivers.
+ #
+  # We ensure that metric is the last entry because it has special behavior:
+  # the test runs only once, and the benchmark acts as a generator that
+  # produces all viable metrics.
+ dimension_keys = list(dimensions.keys())
+ if "metric" in dimension_keys:
+ dimension_keys.remove("metric")
+ dimension_keys.append("metric")
+ drivers = []
+
+ def _start(keywords, finished, left):
+ """Runs a test across dimensions recursively."""
+ # Resolve the method fully, it starts as a string.
+ if "method" in keywords and isinstance(keywords["method"], str):
+ keywords["method"] = methods[keywords["method"]]
+ # Is this a non-recursive case?
+ if not left:
+ driver = benchmark_driver.BenchmarkDriver(producer, runs=runs, **keywords)
+ driver.start()
+ drivers.append((finished, driver))
+ else:
+ # Recurse on the next dimension.
+ current, left = left[0], left[1:]
+ keywords = copy.deepcopy(keywords)
+ if current == "metric":
+ # We use a generator, popped below. Note that metric is
+ # guaranteed to be the last element here, and we will provide
+ # the value for 'done' below when generating the csv.
+ keywords[current] = dimensions[current]
+ _start(keywords, finished, left)
+ else:
+ # Generate manually.
+ for value in dimensions[current]:
+ keywords[current] = value
+ _start(keywords, finished + [value], left)
+
+ # Start all the drivers, recursively.
+ _start(kwargs, [], dimension_keys)
+
+ # Finish all tests, write results.
+ output = csv.writer(sys.stdout)
+ output.writerow(dimension_keys + ["result"])
+ for (done, driver) in drivers:
+ driver.join()
+ for (metric_name, result) in getattr(driver, stat)():
+ output.writerow([ # Collapse the method name.
+ hasattr(x, "__name__") and x.__name__ or x for x in done
+ ] + [metric_name] + result)
+
+
+@runner.command()
+@click.argument("env")
+@click.option(
+ "--cmd", default="uname -a", help="command to run on all found machines")
+@click.option(
+ "--workload", default="true", help="workload to run all found machines")
+def validate(env, cmd, workload):
+ """Validates an environment described by yaml file."""
+ producer = yaml_producer.YamlMachineProducer(env)
+ for machine in producer.machines:
+ print("Machine %s:" % machine)
+ stdout, _ = machine.run(cmd)
+ print(" Output of '%s': %s" % (cmd, stdout.lstrip().rstrip()))
+ image = machine.pull(workload)
+ stdout = machine.container(image).run()
+ print(" Container %s: %s" % (workload, stdout.lstrip().rstrip()))
diff --git a/benchmarks/runner/commands.py b/benchmarks/runner/commands.py
new file mode 100644
index 000000000..4973843b9
--- /dev/null
+++ b/benchmarks/runner/commands.py
@@ -0,0 +1,84 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module with the guts of `click` commands.
+
+Overrides of click.core.Command. This is done so that flags are inherited
+between similar commands (the run commands). The classes below are meant to be
+used in click decorators like so:
+
+@runner.command("run-mock", RunCommand)
+def run_mock(**kwargs):
+ # mock implementation
+
+"""
+import click
+
+
+class RunCommand(click.core.Command):
+ """Base Run Command with flags.
+
+ Attributes:
+    method: regex of which suites to choose (e.g. sysbench would run
+      sysbench.cpu, sysbench.memory, and sysbench.mutex). See the list command
+      for details.
+ metric: metric(s) to extract. See list command for details.
+ runtime: the runtime(s) on which to run.
+ runs: the number of runs to do of each method.
+    stat: how to compile results in the case of multiple runs (e.g. median).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ method = click.core.Argument(("method",))
+
+ metric = click.core.Option(("--metric",),
+ help="The metric to extract.",
+ multiple=True)
+
+ runtime = click.core.Option(("--runtime",),
+ default=["runc"],
+ help="The runtime to use.",
+ multiple=True)
+ runs = click.core.Option(("--runs",),
+ default=1,
+ help="The number of times to run each benchmark.")
+ stat = click.core.Option(
+ ("--stat",),
+ default="median",
+ help="How to aggregate the data from all runs."
+ "\nmedian - returns the median of all runs (default)"
+ "\nall - returns all results comma separated"
+ "\nmeanstd - returns result as mean,std")
+ self.params.extend([method, runtime, runs, stat, metric])
+ self.ignore_unknown_options = True
+ self.allow_extra_args = True
+
+
+class LocalCommand(RunCommand):
+ """LocalCommand inherits all flags from RunCommand.
+
+ Attributes:
+    limit: limits the number of machines on which to run benchmarks, which in
+      turn bounds how many benchmarks may run locally at a time. For example,
+      "startup" requires one machine, so a limit of two allows two startup jobs
+      to run at a time. Defaults to 1.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.params.append(
+ click.core.Option(
+ ("--limit",),
+ default=1,
+ help="Limit of number of benchmarks that can run at a given time."))
diff --git a/benchmarks/runner/runner_test.py b/benchmarks/runner/runner_test.py
new file mode 100644
index 000000000..7818d631a
--- /dev/null
+++ b/benchmarks/runner/runner_test.py
@@ -0,0 +1,59 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Top-level tests."""
+
+import os
+import subprocess
+import sys
+
+from click import testing
+import pytest
+
+from benchmarks import runner
+
+
+def _get_locale():
+ output = subprocess.check_output(["locale", "-a"])
+ locales = output.split()
+ if b"en_US.utf8" in locales:
+ return "en_US.UTF-8"
+ else:
+ return "C.UTF-8"
+
+
+def _set_locale():
+ locale = _get_locale()
+ if os.getenv("LANG") != locale:
+ os.environ["LANG"] = locale
+ os.environ["LC_ALL"] = locale
+ os.execv("/proc/self/exe", ["python"] + sys.argv)
+
+
+def test_list():
+ cli_runner = testing.CliRunner()
+ result = cli_runner.invoke(runner.runner, ["list"])
+ print(result.output)
+ assert result.exit_code == 0
+
+
+def test_run():
+ cli_runner = testing.CliRunner()
+ result = cli_runner.invoke(runner.runner, ["run-mock", "."])
+ print(result.output)
+ assert result.exit_code == 0
+
+
+if __name__ == "__main__":
+ _set_locale()
+ sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/suites/BUILD b/benchmarks/suites/BUILD
new file mode 100644
index 000000000..04fc23261
--- /dev/null
+++ b/benchmarks/suites/BUILD
@@ -0,0 +1,130 @@
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "suites",
+ srcs = ["__init__.py"],
+)
+
+py_library(
+ name = "absl",
+ srcs = ["absl.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/suites",
+ "//benchmarks/workloads/absl",
+ ],
+)
+
+py_library(
+ name = "density",
+ srcs = ["density.py"],
+ deps = [
+ "//benchmarks/harness:container",
+ "//benchmarks/harness:machine",
+ "//benchmarks/suites",
+ "//benchmarks/suites:helpers",
+ ],
+)
+
+py_library(
+ name = "fio",
+ srcs = ["fio.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/suites",
+ "//benchmarks/suites:helpers",
+ "//benchmarks/workloads/fio",
+ ],
+)
+
+py_library(
+ name = "helpers",
+ srcs = ["helpers.py"],
+ deps = ["//benchmarks/harness:machine"],
+)
+
+py_library(
+ name = "http",
+ srcs = ["http.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/suites",
+ "//benchmarks/workloads/ab",
+ ],
+)
+
+py_library(
+ name = "media",
+ srcs = ["media.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/suites",
+ "//benchmarks/suites:helpers",
+ "//benchmarks/workloads/ffmpeg",
+ ],
+)
+
+py_library(
+ name = "ml",
+ srcs = ["ml.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/suites",
+ "//benchmarks/suites:startup",
+ "//benchmarks/workloads/tensorflow",
+ ],
+)
+
+py_library(
+ name = "network",
+ srcs = ["network.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/suites",
+ "//benchmarks/suites:helpers",
+ "//benchmarks/workloads/iperf",
+ ],
+)
+
+py_library(
+ name = "redis",
+ srcs = ["redis.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/suites",
+ "//benchmarks/workloads/redisbenchmark",
+ ],
+)
+
+py_library(
+ name = "startup",
+ srcs = ["startup.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/suites",
+ "//benchmarks/suites:helpers",
+ ],
+)
+
+py_library(
+ name = "sysbench",
+ srcs = ["sysbench.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/suites",
+ "//benchmarks/workloads/sysbench",
+ ],
+)
+
+py_library(
+ name = "syscall",
+ srcs = ["syscall.py"],
+ deps = [
+ "//benchmarks/harness:machine",
+ "//benchmarks/suites",
+ "//benchmarks/workloads/syscall",
+ ],
+)
diff --git a/benchmarks/suites/__init__.py b/benchmarks/suites/__init__.py
new file mode 100644
index 000000000..360736cc3
--- /dev/null
+++ b/benchmarks/suites/__init__.py
@@ -0,0 +1,119 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Core benchmark annotations."""
+
+import functools
+import inspect
+import types
+from typing import List
+from typing import Tuple
+
+BENCHMARK_METRICS = '__benchmark_metrics__'
+BENCHMARK_MACHINES = '__benchmark_machines__'
+
+
+def is_benchmark(func: types.FunctionType) -> bool:
+ """Returns true if the given function is a benchmark."""
+ return isinstance(func, types.FunctionType) and \
+ hasattr(func, BENCHMARK_METRICS) and \
+ hasattr(func, BENCHMARK_MACHINES)
+
+
+def benchmark_metrics(func: types.FunctionType) -> List[Tuple[str, str]]:
+ """Returns the list of available metrics."""
+ return [(metric.__name__, metric.__doc__)
+ for metric in getattr(func, BENCHMARK_METRICS)]
+
+
+def benchmark_machines(func: types.FunctionType) -> int:
+ """Returns the number of machines required."""
+ return getattr(func, BENCHMARK_MACHINES)
+
+
+# pylint: disable=unused-argument
+def default(value, **kwargs):
+ """Returns the passed value."""
+ return value
+
+
+def benchmark(metrics: List[types.FunctionType] = None,
+ machines: int = 1) -> types.FunctionType:
+ """Define a benchmark function with metrics.
+
+ Args:
+ metrics: A list of metric functions.
+ machines: The number of machines required.
+
+ Returns:
+    A decorator that wraps the benchmark function; the wrapped function accepts
+    the given number of machines and yields (metric_name, metric_value) pairs
+    when called.
+ """
+ if not metrics:
+ # The default passes through.
+ metrics = [default]
+
+ def decorator(func: types.FunctionType) -> types.FunctionType:
+ """Decorator function."""
+ # Every benchmark should accept at least two parameters:
+ # runtime: The runtime to use for the benchmark (str, required).
+ # metrics: The metrics to use, if not the default (str, optional).
+ @functools.wraps(func)
+ def wrapper(*args, runtime: str, metric: list = None, **kwargs):
+ """Wrapper function."""
+ # First -- ensure that we marshall all types appropriately. In
+ # general, we will call this with only strings. These strings will
+ # need to be converted to their underlying types/classes.
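+      # For example (illustrative), a parameter annotated as int may receive
+      # the string "5000" from the command line and is converted below via
+      # int("5000").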
+ sig = inspect.signature(func)
+ for param in sig.parameters.values():
+ if param.annotation != inspect.Parameter.empty and \
+ param.name in kwargs and not isinstance(kwargs[param.name], param.annotation):
+ try:
+ # Marshall to the appropriate type.
+ kwargs[param.name] = param.annotation(kwargs[param.name])
+ except Exception as exc:
+ raise ValueError(
+ 'illegal type for %s(%s=%s): %s' %
+ (func.__name__, param.name, kwargs[param.name], exc))
+ elif param.default != inspect.Parameter.empty and \
+ param.name not in kwargs:
+ # Ensure that we have the value set, because it will
+ # be passed to the metric function for evaluation.
+ kwargs[param.name] = param.default
+
+ # Next, figure out how to apply a metric. We do this prior to
+ # running the underlying function to prevent having to wait a few
+ # minutes for a result just to see some error.
+ if not metric:
+ # Return all metrics in the iterator.
+ result = func(*args, runtime=runtime, **kwargs)
+ for metric_func in metrics:
+ yield (metric_func.__name__, metric_func(result, **kwargs))
+ else:
+ result = None
+ for single_metric in metric:
+ for metric_func in metrics:
+ # Is this a function that matches the name?
+ # Apply this function to the result.
+ if metric_func.__name__ == single_metric:
+ if not result:
+ # Lazy evaluation: only if metric matches.
+ result = func(*args, runtime=runtime, **kwargs)
+ yield single_metric, metric_func(result, **kwargs)
+
+ # Set metadata on the benchmark (used above).
+ setattr(wrapper, BENCHMARK_METRICS, metrics)
+ setattr(wrapper, BENCHMARK_MACHINES, machines)
+ return wrapper
+
+ return decorator
diff --git a/benchmarks/suites/absl.py b/benchmarks/suites/absl.py
new file mode 100644
index 000000000..5d9b57a09
--- /dev/null
+++ b/benchmarks/suites/absl.py
@@ -0,0 +1,37 @@
+# python3
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""absl build benchmark."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.workloads import absl
+
+
+@suites.benchmark(metrics=[absl.elapsed_time], machines=1)
+def build(target: machine.Machine, **kwargs) -> str:
+ """Runs the absl workload and report the absl build time.
+
+ Runs the 'bazel build //absl/...' in a clean bazel directory and
+ monitors time elapsed.
+
+ Args:
+ target: A machine object.
+ **kwargs: Additional container options.
+
+ Returns:
+ Container output.
+ """
+ image = target.pull("absl")
+ return target.container(image, **kwargs).run()
diff --git a/benchmarks/suites/density.py b/benchmarks/suites/density.py
new file mode 100644
index 000000000..89d29fb26
--- /dev/null
+++ b/benchmarks/suites/density.py
@@ -0,0 +1,121 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Density tests."""
+
+import re
+import types
+
+from benchmarks import suites
+from benchmarks.harness import container
+from benchmarks.harness import machine
+from benchmarks.suites import helpers
+
+
+# pylint: disable=unused-argument
+def memory_usage(value, **kwargs):
+ """Returns the passed value."""
+ return value
+
+
+def density(target: machine.Machine,
+ workload: str,
+ count: int = 50,
+ wait: float = 0,
+ load_func: types.FunctionType = None,
+ **kwargs):
+ """Calculate the average memory usage per container.
+
+ Args:
+ target: A machine object.
+ workload: The workload to run.
+ count: The number of containers to start.
+ wait: The time to wait after starting.
+    load_func: Callback that is called after count containers have been started
+      on the given machine.
+ **kwargs: Additional container options.
+
+ Returns:
+    The average memory usage in bytes per container.
+ """
+ count = int(count)
+
+ # Drop all caches.
+ helpers.drop_caches(target)
+ before = target.read("/proc/meminfo")
+
+ # Load the workload.
+ image = target.pull(workload)
+
+ with target.container(
+ image=image, count=count, **kwargs).detach() as containers:
+ # Call the optional load function callback if given.
+ if load_func:
+ load_func(target, containers)
+ # Wait 'wait' time before taking a measurement.
+ target.sleep(wait)
+
+ # Drop caches again.
+ helpers.drop_caches(target)
+ after = target.read("/proc/meminfo")
+
+ # Calculate the memory used.
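+  # For example (illustrative numbers): if MemAvailable drops by 512000 kB
+  # after starting 50 containers, the result is 1024 * 512000 / 50 bytes per
+  # container.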
+ available_re = re.compile(r"MemAvailable:\s*(\d+)\skB\n")
+ before_available = available_re.findall(before)
+ after_available = available_re.findall(after)
+ return 1024 * float(int(before_available[0]) -
+ int(after_available[0])) / float(count)
+
+
+def load_redis(target: machine.Machine, containers: container.Container):
+ """Use redis-benchmark "LPUSH" to load each container with 1G of data.
+
+ Args:
+ target: A machine object.
+ containers: A set of containers.
+ """
+ target.pull("redisbenchmark")
+ for name in containers.get_names():
+ flags = "-d 10000 -t LPUSH"
+ target.container(
+ "redisbenchmark", links={
+ name: name
+ }).run(
+ host=name, flags=flags)
+
+
+@suites.benchmark(metrics=[memory_usage], machines=1)
+def empty(target: machine.Machine, **kwargs) -> float:
+ """Run trivial containers in a density test."""
+ return density(target, workload="sleep", wait=1.0, **kwargs)
+
+
+@suites.benchmark(metrics=[memory_usage], machines=1)
+def node(target: machine.Machine, **kwargs) -> float:
+ """Run node containers in a density test."""
+ return density(target, workload="node", wait=3.0, **kwargs)
+
+
+@suites.benchmark(metrics=[memory_usage], machines=1)
+def ruby(target: machine.Machine, **kwargs) -> float:
+ """Run ruby containers in a density test."""
+ return density(target, workload="ruby", wait=3.0, **kwargs)
+
+
+@suites.benchmark(metrics=[memory_usage], machines=1)
+def redis(target: machine.Machine, **kwargs) -> float:
+ """Run redis containers in a density test."""
+ if "count" not in kwargs:
+ kwargs["count"] = 5
+ return density(
+ target, workload="redis", wait=3.0, load_func=load_redis, **kwargs)
diff --git a/benchmarks/suites/fio.py b/benchmarks/suites/fio.py
new file mode 100644
index 000000000..2171790c5
--- /dev/null
+++ b/benchmarks/suites/fio.py
@@ -0,0 +1,165 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""File I/O tests."""
+
+import os
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.suites import helpers
+from benchmarks.workloads import fio
+
+
+# pylint: disable=too-many-arguments
+# pylint: disable=too-many-locals
+def run_fio(target: machine.Machine,
+ test: str,
+ ioengine: str = "sync",
+ size: int = 1024 * 1024 * 1024,
+ iodepth: int = 4,
+ blocksize: int = 1024 * 1024,
+ time: int = -1,
+ mount_dir: str = "",
+ filename: str = "file.dat",
+ tmpfs: bool = False,
+ ramp_time: int = 0,
+ **kwargs) -> str:
+ """FIO benchmarks.
+
+ For more on fio see:
+ https://media.readthedocs.org/pdf/fio/latest/fio.pdf
+
+ Args:
+ target: A machine object.
+ test: The test to run (read, write, randread, randwrite, etc.)
+ ioengine: The engine for I/O.
+ size: The size of the generated file in bytes (if an integer) or 5g, 16k,
+ etc.
+ iodepth: The I/O for certain engines.
+ blocksize: The blocksize for reads and writes in bytes (if an integer) or
+ 4k, etc.
+ time: If test is time based, how long to run in seconds.
+ mount_dir: The absolute path on the host to mount a bind mount.
+    filename: The name of the file to create inside the container. For a path
+      of /dir/dir/file, the script sets up a volume like 'docker run -v
+      mount_dir:/dir/dir fio' and fio will create (and delete) the file
+      /dir/dir/file. If tmpfs is set, /dir/dir will be a tmpfs.
+ tmpfs: If true, mount on tmpfs.
+    ramp_time: The time to run before recording statistics.
+ **kwargs: Additional container options.
+
+ Returns:
+ The output of fio as a string.
+ """
+ # Pull the image before dropping caches.
+ image = target.pull("fio")
+
+ if not mount_dir:
+ stdout, _ = target.run("pwd")
+ mount_dir = stdout.rstrip()
+
+ # Setup the volumes.
+ volumes = {mount_dir: {"bind": "/disk", "mode": "rw"}} if not tmpfs else None
+ tmpfs = {"/disk": ""} if tmpfs else None
+
+ # Construct a file in the volume.
+ filepath = os.path.join("/disk", filename)
+
+  # If we are running a read test, use fio to write a file and then flush the
+  # file data from memory.
+ if "read" in test:
+ target.container(
+ image, volumes=volumes, tmpfs=tmpfs, **kwargs).run(
+ test="write",
+ ioengine="sync",
+ size=size,
+ iodepth=iodepth,
+ blocksize=blocksize,
+ path=filepath)
+ helpers.drop_caches(target)
+
+ # Run the test.
+ time_str = "--time_base --runtime={time}".format(
+ time=time) if int(time) > 0 else ""
+ res = target.container(
+ image, volumes=volumes, tmpfs=tmpfs, **kwargs).run(
+ test=test,
+ ioengine=ioengine,
+ size=size,
+ iodepth=iodepth,
+ blocksize=blocksize,
+ time=time_str,
+ path=filepath,
+ ramp_time=ramp_time)
+
+ target.run(
+ "rm {path}".format(path=os.path.join(mount_dir.rstrip(), filename)))
+
+ return res
+
+
+@suites.benchmark(metrics=[fio.read_bandwidth, fio.read_io_ops], machines=1)
+def read(*args, **kwargs):
+ """Read test.
+
+ Args:
+ *args: None.
+ **kwargs: Additional container options.
+
+ Returns:
+ The output of fio.
+ """
+ return run_fio(*args, test="read", **kwargs)
+
+
+@suites.benchmark(metrics=[fio.read_bandwidth, fio.read_io_ops], machines=1)
+def randread(*args, **kwargs):
+ """Random read test.
+
+ Args:
+ *args: None.
+ **kwargs: Additional container options.
+
+ Returns:
+ The output of fio.
+ """
+ return run_fio(*args, test="randread", **kwargs)
+
+
+@suites.benchmark(metrics=[fio.write_bandwidth, fio.write_io_ops], machines=1)
+def write(*args, **kwargs):
+ """Write test.
+
+ Args:
+ *args: None.
+ **kwargs: Additional container options.
+
+ Returns:
+ The output of fio.
+ """
+ return run_fio(*args, test="write", **kwargs)
+
+
+@suites.benchmark(metrics=[fio.write_bandwidth, fio.write_io_ops], machines=1)
+def randwrite(*args, **kwargs):
+ """Random write test.
+
+ Args:
+ *args: None.
+ **kwargs: Additional container options.
+
+ Returns:
+ The output of fio.
+ """
+ return run_fio(*args, test="randwrite", **kwargs)
diff --git a/benchmarks/suites/helpers.py b/benchmarks/suites/helpers.py
new file mode 100644
index 000000000..b3c7360ab
--- /dev/null
+++ b/benchmarks/suites/helpers.py
@@ -0,0 +1,57 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Benchmark helpers."""
+
+import datetime
+from benchmarks.harness import machine
+
+
+class Timer:
+ """Helper to time runtime of some call.
+
+  Usage:
+
+    with Timer() as t:
+      # Do something.
+      t.elapsed()
+ """
+
+ def __init__(self):
+ self._start = datetime.datetime.now()
+
+ def __enter__(self):
+ self.start()
+ return self
+
+ def start(self):
+ """Starts the timer."""
+ self._start = datetime.datetime.now()
+
+ def elapsed(self) -> float:
+ """Returns the elapsed time in seconds."""
+ return (datetime.datetime.now() - self._start).total_seconds()
+
+ def __exit__(self, exception_type, exception_value, exception_traceback):
+ pass
+
+
+def drop_caches(target: machine.Machine):
+ """Drops caches on the machine.
+
+ Args:
+ target: A machine object.
+ """
+ target.run("sudo sync")
+ target.run("sudo sysctl vm.drop_caches=3")
+ target.run("sudo sysctl vm.drop_caches=3")
diff --git a/benchmarks/suites/http.py b/benchmarks/suites/http.py
new file mode 100644
index 000000000..6efea938c
--- /dev/null
+++ b/benchmarks/suites/http.py
@@ -0,0 +1,138 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""HTTP benchmarks."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.workloads import ab
+
+
+# pylint: disable=too-many-arguments
+def http(server: machine.Machine,
+ client: machine.Machine,
+ workload: str,
+ requests: int = 5000,
+ connections: int = 10,
+ port: int = 80,
+ path: str = "notfound",
+ **kwargs) -> str:
+ """Run apachebench (ab) against an http server.
+
+ Args:
+ server: A machine object.
+ client: A machine object.
+ workload: The http-serving workload.
+ requests: Number of requests to send the server. Default is 5000.
+    connections: Number of concurrent connections to use. Default is 10.
+ port: The port to access in benchmarking.
+ path: File to download, generally workload-specific.
+ **kwargs: Additional container options.
+
+ Returns:
+ The full apachebench output.
+ """
+ # Pull the client & server.
+ apachebench = client.pull("ab")
+ netcat = client.pull("netcat")
+ image = server.pull(workload)
+
+ with server.container(image, port=port, **kwargs).detach() as container:
+ (host, port) = container.address()
+ # Wait for the server to come up.
+ client.container(netcat).run(host=host, port=port)
+ # Run the benchmark, no arguments.
+ return client.container(apachebench).run(
+ host=host,
+ port=port,
+ requests=requests,
+ connections=connections,
+ path=path)
+
+
+# pylint: disable=too-many-arguments
+# pylint: disable=too-many-locals
+def http_app(server: machine.Machine,
+ client: machine.Machine,
+ workload: str,
+ requests: int = 5000,
+ connections: int = 10,
+ port: int = 80,
+ path: str = "notfound",
+ **kwargs) -> str:
+ """Run apachebench (ab) against an http application.
+
+ Args:
+ server: A machine object.
+ client: A machine object.
+ workload: The http-serving workload.
+ requests: Number of requests to send the server. Default is 5000.
+    connections: Number of concurrent connections to use. Default is 10.
+ port: The port to use for benchmarking.
+ path: File to download, generally workload-specific.
+ **kwargs: Additional container options.
+
+ Returns:
+ The full apachebench output.
+ """
+ # Pull the client & server.
+ apachebench = client.pull("ab")
+ netcat = client.pull("netcat")
+ server_netcat = server.pull("netcat")
+ redis = server.pull("redis")
+ image = server.pull(workload)
+ redis_port = 6379
+ redis_name = "{workload}_redis_server".format(workload=workload)
+
+ with server.container(redis, name=redis_name).detach():
+ server.container(server_netcat, links={redis_name: redis_name})\
+ .run(host=redis_name, port=redis_port)
+ with server.container(image, port=port, links={redis_name: redis_name}, **kwargs)\
+ .detach(host=redis_name) as container:
+ (host, port) = container.address()
+ # Wait for the server to come up.
+ client.container(netcat).run(host=host, port=port)
+ # Run the benchmark, no arguments.
+ return client.container(apachebench).run(
+ host=host,
+ port=port,
+ requests=requests,
+ connections=connections,
+ path=path)
+
+
+@suites.benchmark(metrics=[ab.transfer_rate, ab.latency], machines=2)
+def httpd(*args, **kwargs) -> str:
+ """Apache2 benchmark."""
+ return http(*args, workload="httpd", port=80, **kwargs)
+
+
+@suites.benchmark(
+ metrics=[ab.transfer_rate, ab.latency, ab.requests_per_second], machines=2)
+def nginx(*args, **kwargs) -> str:
+ """Nginx benchmark."""
+ return http(*args, workload="nginx", port=80, **kwargs)
+
+
+@suites.benchmark(
+ metrics=[ab.transfer_rate, ab.latency, ab.requests_per_second], machines=2)
+def node(*args, **kwargs) -> str:
+ """Node benchmark."""
+ return http_app(*args, workload="node_template", path="", port=8080, **kwargs)
+
+
+@suites.benchmark(
+ metrics=[ab.transfer_rate, ab.latency, ab.requests_per_second], machines=2)
+def ruby(*args, **kwargs) -> str:
+ """Ruby benchmark."""
+ return http_app(*args, workload="ruby_template", path="", port=9292, **kwargs)
diff --git a/benchmarks/suites/media.py b/benchmarks/suites/media.py
new file mode 100644
index 000000000..9cbffdaa1
--- /dev/null
+++ b/benchmarks/suites/media.py
@@ -0,0 +1,42 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Media processing benchmarks."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.suites import helpers
+from benchmarks.workloads import ffmpeg
+
+
+@suites.benchmark(metrics=[ffmpeg.run_time], machines=1)
+def transcode(target: machine.Machine, **kwargs) -> float:
+ """Runs a video transcoding workload and times it.
+
+ Args:
+ target: A machine object.
+ **kwargs: Additional container options.
+
+ Returns:
+ Total workload runtime.
+ """
+ # Load before timing.
+ image = target.pull("ffmpeg")
+
+ # Drop caches.
+ helpers.drop_caches(target)
+
+ # Time startup + transcoding.
+ with helpers.Timer() as timer:
+ target.container(image, **kwargs).run()
+ return timer.elapsed()
diff --git a/benchmarks/suites/ml.py b/benchmarks/suites/ml.py
new file mode 100644
index 000000000..a394d1f69
--- /dev/null
+++ b/benchmarks/suites/ml.py
@@ -0,0 +1,33 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Machine Learning tests."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.suites import startup
+from benchmarks.workloads import tensorflow
+
+
+@suites.benchmark(metrics=[tensorflow.run_time], machines=1)
+def train(target: machine.Machine, **kwargs):
+ """Run the tensorflow benchmark and return the runtime in seconds of workload.
+
+ Args:
+ target: A machine object.
+ **kwargs: Additional container options.
+
+ Returns:
+ The total runtime.
+ """
+ return startup.startup(target, workload="tensorflow", count=1, **kwargs)
diff --git a/benchmarks/suites/network.py b/benchmarks/suites/network.py
new file mode 100644
index 000000000..f973cf3f1
--- /dev/null
+++ b/benchmarks/suites/network.py
@@ -0,0 +1,101 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Network microbenchmarks."""
+
+from typing import Dict
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.suites import helpers
+from benchmarks.workloads import iperf
+
+
+def run_iperf(client: machine.Machine,
+ server: machine.Machine,
+ client_kwargs: Dict[str, str] = None,
+ server_kwargs: Dict[str, str] = None) -> str:
+ """Measure iperf performance.
+
+ Args:
+ client: A machine object.
+ server: A machine object.
+ client_kwargs: Additional client container options.
+ server_kwargs: Additional server container options.
+
+ Returns:
+ The output of iperf.
+ """
+ if not client_kwargs:
+ client_kwargs = dict()
+ if not server_kwargs:
+ server_kwargs = dict()
+
+ # Pull images.
+ netcat = client.pull("netcat")
+ iperf_client_image = client.pull("iperf")
+ iperf_server_image = server.pull("iperf")
+
+ # Set this due to a bug in the kernel that resets connections.
+ client.run("sudo /sbin/sysctl -w net.netfilter.nf_conntrack_tcp_be_liberal=1")
+ server.run("sudo /sbin/sysctl -w net.netfilter.nf_conntrack_tcp_be_liberal=1")
+
+ with server.container(
+ iperf_server_image, port=5001, **server_kwargs).detach() as iperf_server:
+ (host, port) = iperf_server.address()
+ # Wait until the service is available.
+ client.container(netcat).run(host=host, port=port)
+ # Run a warm-up run.
+ client.container(
+ iperf_client_image, stderr=True, **client_kwargs).run(
+ host=host, port=port)
+ # Run the client with relevant arguments.
+ res = client.container(iperf_client_image, stderr=True, **client_kwargs)\
+ .run(host=host, port=port)
+ helpers.drop_caches(client)
+ return res
+
+
+@suites.benchmark(metrics=[iperf.bandwidth], machines=2)
+def upload(client: machine.Machine, server: machine.Machine, **kwargs) -> str:
+ """Measure upload performance.
+
+ Args:
+ client: A machine object.
+ server: A machine object.
+ **kwargs: Client container options.
+
+ Returns:
+ The output of iperf.
+ """
+ if kwargs["runtime"] == "runc":
+ kwargs["network_mode"] = "host"
+ return run_iperf(client, server, client_kwargs=kwargs)
+
+
+@suites.benchmark(metrics=[iperf.bandwidth], machines=2)
+def download(client: machine.Machine, server: machine.Machine, **kwargs) -> str:
+ """Measure download performance.
+
+ Args:
+ client: A machine object.
+ server: A machine object.
+ **kwargs: Server container options.
+
+ Returns:
+ The output of iperf.
+ """
+
+ client_kwargs = {"network_mode": "host"}
+ return run_iperf(
+ client, server, client_kwargs=client_kwargs, server_kwargs=kwargs)
diff --git a/benchmarks/suites/redis.py b/benchmarks/suites/redis.py
new file mode 100644
index 000000000..b84dd073d
--- /dev/null
+++ b/benchmarks/suites/redis.py
@@ -0,0 +1,46 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Redis benchmarks."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.workloads import redisbenchmark
+
+
+@suites.benchmark(metrics=list(redisbenchmark.METRICS.values()), machines=2)
+def redis(server: machine.Machine,
+ client: machine.Machine,
+ flags: str = "",
+ **kwargs) -> str:
+ """Run redis-benchmark on client pointing at server machine.
+
+ Args:
+ server: A machine object.
+ client: A machine object.
+    flags: Flags to pass to redis-benchmark.
+ **kwargs: Additional container options.
+
+ Returns:
+ Output from redis-benchmark.
+ """
+ redis_server = server.pull("redis")
+ redis_client = client.pull("redisbenchmark")
+ netcat = client.pull("netcat")
+ with server.container(
+ redis_server, port=6379, **kwargs).detach() as container:
+ (host, port) = container.address()
+ # Wait for the container to be up.
+ client.container(netcat).run(host=host, port=port)
+ # Run all redis benchmarks.
+ return client.container(redis_client).run(host=host, port=port, flags=flags)
diff --git a/benchmarks/suites/startup.py b/benchmarks/suites/startup.py
new file mode 100644
index 000000000..a1b6c5753
--- /dev/null
+++ b/benchmarks/suites/startup.py
@@ -0,0 +1,110 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Start-up benchmarks."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.suites import helpers
+
+
+# pylint: disable=unused-argument
+def startup_time_ms(value, **kwargs):
+ """Returns average startup time per container in milliseconds.
+
+ Args:
+ value: The floating point time in seconds.
+ **kwargs: Ignored.
+
+ Returns:
+ The time given in milliseconds.
+ """
+ return value * 1000
+
+
+def startup(target: machine.Machine,
+ workload: str,
+ count: int = 5,
+ port: int = 0,
+ **kwargs):
+ """Time the startup of some workload.
+
+ Args:
+ target: A machine object.
+ workload: The workload to run.
+ count: Number of containers to start.
+ port: The port to check for liveness, if provided.
+ **kwargs: Additional container options.
+
+ Returns:
+ The mean start-up time in seconds.
+ """
+ # Load before timing.
+ image = target.pull(workload)
+ netcat = target.pull("netcat")
+ count = int(count)
+ port = int(port)
+
+ with helpers.Timer() as timer:
+ for _ in range(count):
+ if not port:
+ # Run the container synchronously.
+ target.container(image, **kwargs).run()
+ else:
+ # Run a detached container until httpd available.
+ with target.container(image, port=port, **kwargs).detach() as server:
+ (server_host, server_port) = server.address()
+ target.container(netcat).run(host=server_host, port=server_port)
+ return timer.elapsed() / float(count)
+
+
+@suites.benchmark(metrics=[startup_time_ms], machines=1)
+def empty(target: machine.Machine, **kwargs) -> float:
+ """Time the startup of a trivial container.
+
+ Args:
+ target: A machine object.
+ **kwargs: Additional startup options.
+
+ Returns:
+ The time to run the container.
+ """
+ return startup(target, workload="true", **kwargs)
+
+
+@suites.benchmark(metrics=[startup_time_ms], machines=1)
+def node(target: machine.Machine, **kwargs) -> float:
+ """Time the startup of the node container.
+
+ Args:
+ target: A machine object.
+    **kwargs: Additional startup options.
+
+ Returns:
+ The time to run the container.
+ """
+ return startup(target, workload="node", port=8080, **kwargs)
+
+
+@suites.benchmark(metrics=[startup_time_ms], machines=1)
+def ruby(target: machine.Machine, **kwargs) -> float:
+ """Time the startup of the ruby container.
+
+ Args:
+ target: A machine object.
+ **kwargs: Additional startup options.
+
+ Returns:
+ The time to run the container.
+ """
+ return startup(target, workload="ruby", port=3000, **kwargs)
diff --git a/benchmarks/suites/sysbench.py b/benchmarks/suites/sysbench.py
new file mode 100644
index 000000000..2a6e2126c
--- /dev/null
+++ b/benchmarks/suites/sysbench.py
@@ -0,0 +1,119 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Sysbench-based benchmarks."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.workloads import sysbench
+
+
+def run_sysbench(target: machine.Machine,
+ test: str = "cpu",
+ threads: int = 8,
+ time: int = 5,
+ options: str = "",
+ **kwargs) -> str:
+ """Run sysbench container with arguments.
+
+ Args:
+ target: A machine object.
+ test: Relevant sysbench test to run (e.g. cpu, memory).
+ threads: The number of threads to use for tests.
+ time: The time to run tests.
+ options: Additional sysbench options.
+ **kwargs: Additional container options.
+
+ Returns:
+ The output of the command as a string.
+ """
+ image = target.pull("sysbench")
+ return target.container(image, **kwargs).run(
+ test=test, threads=threads, time=time, options=options)
+
+
+@suites.benchmark(metrics=[sysbench.cpu_events_per_second], machines=1)
+def cpu(target: machine.Machine, max_prime: int = 5000, **kwargs) -> str:
+ """Run sysbench CPU test.
+
+ Additional arguments can be provided for sysbench.
+
+ Args:
+ target: A machine object.
+ max_prime: The maximum prime number to search.
+ **kwargs:
+ - threads: The number of threads to use for tests.
+ - time: The time to run tests.
+ - options: Additional sysbench options. See sysbench tool:
+ https://github.com/akopytov/sysbench
+
+ Returns:
+ Sysbench output.
+ """
+ options = kwargs.pop("options", "")
+ options += " --cpu-max-prime={}".format(max_prime)
+ return run_sysbench(target, test="cpu", options=options, **kwargs)
+
+
+@suites.benchmark(metrics=[sysbench.memory_ops_per_second], machines=1)
+def memory(target: machine.Machine, **kwargs) -> str:
+ """Run sysbench memory test.
+
+ Additional arguments can be provided per sysbench.
+
+ Args:
+ target: A machine object.
+ **kwargs:
+ - threads: The number of threads to use for tests.
+ - time: The time to run tests.
+ - options: Additional sysbench options. See sysbench tool:
+ https://github.com/akopytov/sysbench
+
+ Returns:
+ Sysbench output.
+ """
+ return run_sysbench(target, test="memory", **kwargs)
+
+
+@suites.benchmark(
+ metrics=[
+ sysbench.mutex_time, sysbench.mutex_latency, sysbench.mutex_deviation
+ ],
+ machines=1)
+def mutex(target: machine.Machine,
+ locks: int = 4,
+ count: int = 10000000,
+ threads: int = 8,
+ **kwargs) -> str:
+ """Run sysbench mutex test.
+
+ Additional arguments can be provided per sysbench.
+
+ Args:
+ target: A machine object.
+ locks: The number of locks to use.
+ count: The number of mutexes.
+ threads: The number of threads to use for tests.
+ **kwargs:
+ - time: The time to run tests.
+ - options: Additional sysbench options. See sysbench tool:
+ https://github.com/akopytov/sysbench
+
+ Returns:
+ Sysbench output.
+ """
+ options = kwargs.pop("options", "")
+ options += " --mutex-loops=1 --mutex-locks={} --mutex-num={}".format(
+ count, locks)
+ return run_sysbench(
+ target, test="mutex", options=options, threads=threads, **kwargs)
diff --git a/benchmarks/suites/syscall.py b/benchmarks/suites/syscall.py
new file mode 100644
index 000000000..fa7665b00
--- /dev/null
+++ b/benchmarks/suites/syscall.py
@@ -0,0 +1,37 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Syscall microbenchmark."""
+
+from benchmarks import suites
+from benchmarks.harness import machine
+from benchmarks.workloads.syscall import syscall_time_ns
+
+
+@suites.benchmark(metrics=[syscall_time_ns], machines=1)
+def syscall(target: machine.Machine, count: int = 1000000, **kwargs) -> str:
+ """Runs the syscall workload and report the syscall time.
+
+ Runs the syscall 'SYS_gettimeofday(0,0)' 'count' times and monitors time
+ elapsed based on the runtime's MONOTONIC clock.
+
+ Args:
+ target: A machine object.
+ count: The number of syscalls to execute.
+ **kwargs: Additional container options.
+
+ Returns:
+ Container output.
+ """
+ image = target.pull("syscall")
+ return target.container(image, **kwargs).run(count=count)
diff --git a/benchmarks/tcp/BUILD b/benchmarks/tcp/BUILD
new file mode 100644
index 000000000..735d7127f
--- /dev/null
+++ b/benchmarks/tcp/BUILD
@@ -0,0 +1,41 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+load("@rules_cc//cc:defs.bzl", "cc_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+ name = "tcp_proxy",
+ srcs = ["tcp_proxy.go"],
+ visibility = ["//:sandbox"],
+ deps = [
+ "//pkg/tcpip",
+ "//pkg/tcpip/adapters/gonet",
+ "//pkg/tcpip/link/fdbased",
+ "//pkg/tcpip/network/arp",
+ "//pkg/tcpip/network/ipv4",
+ "//pkg/tcpip/stack",
+ "//pkg/tcpip/transport/tcp",
+ "//pkg/tcpip/transport/udp",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+# nsjoin is a trivial replacement for nsenter. This is used because nsenter is
+# not available on all systems where this benchmark is run (and we aim to
+# minimize external dependencies).
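+#
+# Example invocation (illustrative; the namespace file path is hypothetical):
+#   nsjoin /var/run/netns/client <command> [arguments...]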
+
+cc_binary(
+ name = "nsjoin",
+ srcs = ["nsjoin.c"],
+ visibility = ["//:sandbox"],
+)
+
+sh_binary(
+ name = "tcp_benchmark",
+ srcs = ["tcp_benchmark.sh"],
+ data = [
+ ":nsjoin",
+ ":tcp_proxy",
+ ],
+ visibility = ["//:sandbox"],
+)
diff --git a/benchmarks/tcp/README.md b/benchmarks/tcp/README.md
new file mode 100644
index 000000000..38e6e69f0
--- /dev/null
+++ b/benchmarks/tcp/README.md
@@ -0,0 +1,87 @@
+# TCP Benchmarks
+
+This directory contains a standardized TCP benchmark. This helps to evaluate the
+performance of netstack and native networking stacks under various conditions.
+
+## `tcp_benchmark`
+
+This benchmark allows TCP throughput testing under various conditions. The setup
+consists of an iperf client, a client proxy, a server proxy and an iperf server.
+The client proxy and server proxy abstract the network mechanism used to
+communicate between the iperf client and server.
+
+The setup looks like the following:
+
+```
+ +--------------+ (native) +--------------+
+ | iperf client |[lo @ 10.0.0.1]------>| client proxy |
+ +--------------+ +--------------+
+ [client.0 @ 10.0.0.2]
+ (netstack) | | (native)
+ +------+-----+
+ |
+ [br0]
+ |
+ Network emulation applied ---> [wan.0:wan.1]
+ |
+ [br1]
+ |
+ +------+-----+
+ (netstack) | | (native)
+ [server.0 @ 10.0.0.3]
+ +--------------+ +--------------+
+ | iperf server |<------[lo @ 10.0.0.4]| server proxy |
+ +--------------+ (native) +--------------+
+```
+
+Different configurations can be run using different arguments. For example:
+
+* Native test under normal internet conditions: `tcp_benchmark`
+* Native test under ideal conditions: `tcp_benchmark --ideal`
+* Netstack client under ideal conditions: `tcp_benchmark --client --ideal`
+* Netstack client with 5% packet loss: `tcp_benchmark --client --ideal --loss
+ 5`
+
+Use `tcp_benchmark --help` for full arguments.
+
+This tool may be used to easily generate data for graphing. For example, to
+generate a CSV for various latencies, you might do:
+
+```
+rm -f /tmp/netstack_latency.csv /tmp/native_latency.csv
+latencies=$(seq 0 5 50;
+ seq 60 10 100;
+ seq 125 25 250;
+ seq 300 50 500)
+for latency in $latencies; do
+ read throughput client_cpu server_cpu <<< \
+ $(./tcp_benchmark --duration 30 --client --ideal --latency $latency)
+ echo $latency,$throughput,$client_cpu >> /tmp/netstack_latency.csv
+done
+for latency in $latencies; do
+ read throughput client_cpu server_cpu <<< \
+ $(./tcp_benchmark --duration 30 --ideal --latency $latency)
+ echo $latency,$throughput,$client_cpu >> /tmp/native_latency.csv
+done
+```
+
+Similarly, to generate a CSV for various levels of packet loss, the following
+would be appropriate:
+
+```
+rm -f /tmp/netstack_loss.csv /tmp/native_loss.csv
+losses=$(seq 0 0.1 1.0;
+ seq 1.2 0.2 2.0;
+ seq 2.5 0.5 5.0;
+ seq 6.0 1.0 10.0)
+for loss in $losses; do
+ read throughput client_cpu server_cpu <<< \
+ $(./tcp_benchmark --duration 30 --client --ideal --latency 10 --loss $loss)
+ echo $loss,$throughput,$client_cpu >> /tmp/netstack_loss.csv
+done
+for loss in $losses; do
+ read throughput client_cpu server_cpu <<< \
+ $(./tcp_benchmark --duration 30 --ideal --latency 10 --loss $loss)
+ echo $loss,$throughput,$client_cpu >> /tmp/native_loss.csv
+done
+```
diff --git a/benchmarks/tcp/nsjoin.c b/benchmarks/tcp/nsjoin.c
new file mode 100644
index 000000000..524b4d549
--- /dev/null
+++ b/benchmarks/tcp/nsjoin.c
@@ -0,0 +1,47 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+int main(int argc, char** argv) {
+ if (argc <= 2) {
+ fprintf(stderr, "error: must provide a namespace file.\n");
+ fprintf(stderr, "usage: %s <file> [arguments...]\n", argv[0]);
+ return 1;
+ }
+
+ int fd = open(argv[1], O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "error opening %s: %s\n", argv[1], strerror(errno));
+ return 1;
+ }
+ if (setns(fd, 0) < 0) {
+ fprintf(stderr, "error joining %s: %s\n", argv[1], strerror(errno));
+ return 1;
+ }
+
+ execvp(argv[2], &argv[2]);
+ return 1;
+}
diff --git a/benchmarks/tcp/tcp_benchmark.sh b/benchmarks/tcp/tcp_benchmark.sh
new file mode 100755
index 000000000..e65801a7b
--- /dev/null
+++ b/benchmarks/tcp/tcp_benchmark.sh
@@ -0,0 +1,388 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TCP benchmark; see README.md for documentation.
+
+# Fixed parameters.
+iperf_port=45201 # Not likely to be privileged.
+proxy_port=44000 # Ditto.
+client_addr=10.0.0.1
+client_proxy_addr=10.0.0.2
+server_proxy_addr=10.0.0.3
+server_addr=10.0.0.4
+mask=8
+
+# Defaults; this provides a reasonable approximation of a decent internet link.
+# Parameters can be varied independently from this set to see response to
+# various changes in the kind of link available.
+client=false
+server=false
+verbose=false
+gso=0
+swgso=false
+mtu=1280 # 1280 is a reasonable lowest-common-denominator.
+latency=10 # 10ms approximates a fast, dedicated connection.
+latency_variation=1 # +/- 1ms is a relatively low amount of jitter.
+loss=0.1 # 0.1% loss is non-zero, but not extremely high.
+duplicate=0.1 # 0.1% means duplicates are 1/10x as frequent as losses.
+duration=30 # 30s is enough time to get consistent results (experimentally).
+helper_dir=$(dirname $0)
+netstack_opts=
+disable_linux_gso=
+num_client_threads=1
+
+# Check for netem support.
+lsmod_output=$(lsmod | grep sch_netem)
+if [ "$?" != "0" ]; then
+ echo "warning: sch_netem may not be installed." >&2
+fi
+
+while [ $# -gt 0 ]; do
+ case "$1" in
+ --client)
+ client=true
+ ;;
+ --client_tcp_probe_file)
+ shift
+ netstack_opts="${netstack_opts} -client_tcp_probe_file=$1"
+ ;;
+ --server)
+ server=true
+ ;;
+ --verbose)
+ verbose=true
+ ;;
+ --gso)
+ shift
+ gso=$1
+ ;;
+ --swgso)
+ swgso=true
+ ;;
+ --server_tcp_probe_file)
+ shift
+ netstack_opts="${netstack_opts} -server_tcp_probe_file=$1"
+ ;;
+ --ideal)
+ mtu=1500 # Standard ethernet.
+ latency=0 # No latency.
+ latency_variation=0 # No jitter.
+ loss=0 # No loss.
+ duplicate=0 # No duplicates.
+ ;;
+ --mtu)
+ shift
+ [ "$#" -le 0 ] && echo "no mtu provided" && exit 1
+ mtu=$1
+ ;;
+ --sack)
+ netstack_opts="${netstack_opts} -sack"
+ ;;
+ --cubic)
+ netstack_opts="${netstack_opts} -cubic"
+ ;;
+ --duration)
+ shift
+ [ "$#" -le 0 ] && echo "no duration provided" && exit 1
+ duration=$1
+ ;;
+ --latency)
+ shift
+ [ "$#" -le 0 ] && echo "no latency provided" && exit 1
+ latency=$1
+ ;;
+ --latency-variation)
+ shift
+ [ "$#" -le 0 ] && echo "no latency variation provided" && exit 1
+ latency_variation=$1
+ ;;
+ --loss)
+ shift
+ [ "$#" -le 0 ] && echo "no loss probability provided" && exit 1
+ loss=$1
+ ;;
+ --duplicate)
+ shift
+ [ "$#" -le 0 ] && echo "no duplicate provided" && exit 1
+ duplicate=$1
+ ;;
+ --cpuprofile)
+ shift
+ netstack_opts="${netstack_opts} -cpuprofile=$1"
+ ;;
+ --memprofile)
+ shift
+ netstack_opts="${netstack_opts} -memprofile=$1"
+ ;;
+ --disable-linux-gso)
+ disable_linux_gso=1
+ ;;
+ --num-client-threads)
+ shift
+ num_client_threads=$1
+ ;;
+ --helpers)
+ shift
+ [ "$#" -le 0 ] && echo "no helper dir provided" && exit 1
+ helper_dir=$1
+ ;;
+ *)
+ echo "usage: $0 [options]"
+ echo "options:"
+ echo " --help show this message"
+ echo " --verbose verbose output"
+ echo " --client use netstack as the client"
+ echo " --ideal reset all network emulation"
+ echo " --server use netstack as the server"
+ echo " --mtu set the mtu (bytes)"
+ echo " --sack enable SACK support"
+ echo " --cubic enable CUBIC congestion control for Netstack"
+ echo " --duration set the test duration (s)"
+ echo " --latency set the latency (ms)"
+ echo " --latency-variation set the latency variation"
+ echo " --loss set the loss probability (%)"
+ echo " --duplicate set the duplicate probability (%)"
+ echo " --helpers set the helper directory"
+ echo " --num-client-threads number of parallel client threads to run"
+ echo " --disable-linux-gso disable segmentation offload in the Linux network stack"
+ echo ""
+      echo "The output of the script will be:"
+ echo " <throughput> <client-cpu-usage> <server-cpu-usage>"
+ exit 1
+ esac
+ shift
+done
+
+if [ ${verbose} == "true" ]; then
+ set -x
+fi
+
+# Latency needs to be halved, since it's applied on both ways.
+half_latency=$(echo ${latency}/2 | bc -l | awk '{printf "%1.2f", $0}')
+half_loss=$(echo ${loss}/2 | bc -l | awk '{printf "%1.6f", $0}')
+half_duplicate=$(echo ${duplicate}/2 | bc -l | awk '{printf "%1.6f", $0}')
+helper_dir=${helper_dir#$(pwd)/} # Use relative paths.
+proxy_binary=${helper_dir}/tcp_proxy
+nsjoin_binary=${helper_dir}/nsjoin
+
+if [ ! -e ${proxy_binary} ]; then
+ echo "Could not locate ${proxy_binary}, please make sure you've built the binary"
+ exit 1
+fi
+
+if [ ! -e ${nsjoin_binary} ]; then
+ echo "Could not locate ${nsjoin_binary}, please make sure you've built the binary"
+ exit 1
+fi
+
+if [ $(echo ${latency_variation} | awk '{printf "%1.2f", $0}') != "0.00" ]; then
+ # As long as there's some jitter, then we use the paretonormal distribution.
+ # This will preserve the minimum RTT, but add a realistic amount of jitter to
+ # the connection and cause re-ordering, etc. The regular pareto distribution
+ # appears to add an unreasonable level of delay (we want only small spikes).
+ distribution="distribution paretonormal"
+else
+ distribution=""
+fi
+
+# Client proxy that will listen on the client's iperf target and forward
+# traffic using the host networking stack.
+client_args="${proxy_binary} -port ${proxy_port} -forward ${server_proxy_addr}:${proxy_port}"
+if ${client}; then
+ # Client proxy that will listen on the client's iperf target
+ # and forward traffic using netstack.
+ client_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -client \\
+ -mtu ${mtu} -iface client.0 -addr ${client_proxy_addr} -mask ${mask} \\
+ -forward ${server_proxy_addr}:${proxy_port} -gso=${gso} -swgso=${swgso}"
+fi
+
+# Server proxy that will listen on the proxy port and forward to the server's
+# iperf server using the host networking stack.
+server_args="${proxy_binary} -port ${proxy_port} -forward ${server_addr}:${iperf_port}"
+if ${server}; then
+ # Server proxy that will listen on the proxy port and forward to the server's
+ # iperf server using netstack.
+ server_args="${proxy_binary} ${netstack_opts} -port ${proxy_port} -server \\
+ -mtu ${mtu} -iface server.0 -addr ${server_proxy_addr} -mask ${mask} \\
+ -forward ${server_addr}:${iperf_port} -gso=${gso} -swgso=${swgso}"
+fi
+
+# Specify loss and duplicate parameters only if they are non-zero
+loss_opt=""
+if [ "$(echo $half_loss | bc -q)" != "0" ]; then
+ loss_opt="loss random ${half_loss}%"
+fi
+duplicate_opt=""
+if [ "$(echo $half_duplicate | bc -q)" != "0" ]; then
+ duplicate_opt="duplicate ${half_duplicate}%"
+fi
+
+exec unshare -U -m -n -r -f -p --mount-proc /bin/bash << EOF
+set -e -m
+
+if [ ${verbose} == "true" ]; then
+ set -x
+fi
+
+mount -t tmpfs netstack-bench /tmp
+
+# The shell spawned by unshare may have reset PATH by loading profile scripts.
+# Ensure that tools remain discoverable via the parent's PATH.
+export PATH=${PATH}
+
+# Add client, server interfaces.
+ip link add client.0 type veth peer name client.1
+ip link add server.0 type veth peer name server.1
+
+# Add network emulation devices.
+ip link add wan.0 type veth peer name wan.1
+ip link set wan.0 up
+ip link set wan.1 up
+
+# Enroll on the bridge.
+ip link add name br0 type bridge
+ip link add name br1 type bridge
+ip link set client.1 master br0
+ip link set server.1 master br1
+ip link set wan.0 master br0
+ip link set wan.1 master br1
+ip link set br0 up
+ip link set br1 up
+
+# Set the MTU appropriately.
+ip link set client.0 mtu ${mtu}
+ip link set server.0 mtu ${mtu}
+ip link set wan.0 mtu ${mtu}
+ip link set wan.1 mtu ${mtu}
+
+# Add appropriate latency, loss and duplication.
+#
+# This is added in at the point of bridge connection.
+for device in wan.0 wan.1; do
+ # NOTE: We don't support a loss correlation, as testing has shown that it
+ # doesn't work. The man page has a small comment about this: "It is also
+ # possible to add a correlation, but this option is now deprecated due to
+ # the noticed bad behavior." For more information, see netem(8).
+ tc qdisc add dev \$device root netem \\
+ delay ${half_latency}ms ${latency_variation}ms ${distribution} \\
+ ${loss_opt} ${duplicate_opt}
+done
+
+# Start a client proxy.
+touch /tmp/client.netns
+unshare -n mount --bind /proc/self/ns/net /tmp/client.netns
+
+# Move the endpoint into the namespace.
+while ip link | grep client.0 > /dev/null; do
+ ip link set dev client.0 netns /tmp/client.netns
+done
+
+if ! ${client}; then
+ # Only add the address to NIC if netstack is not in use. Otherwise the host
+ # will also process the inbound SYN and send a RST back.
+ ${nsjoin_binary} /tmp/client.netns ip addr add ${client_proxy_addr}/${mask} dev client.0
+fi
+
+# Start a server proxy.
+touch /tmp/server.netns
+unshare -n mount --bind /proc/self/ns/net /tmp/server.netns
+# Move the endpoint into the namespace.
+while ip link | grep server.0 > /dev/null; do
+ ip link set dev server.0 netns /tmp/server.netns
+done
+if ! ${server}; then
+ # Only add the address to NIC if netstack is not in use. Otherwise the host
+ # will also process the inbound SYN and send a RST back.
+ ${nsjoin_binary} /tmp/server.netns ip addr add ${server_proxy_addr}/${mask} dev server.0
+fi
+
+# Add client and server addresses, and bring everything up.
+${nsjoin_binary} /tmp/client.netns ip addr add ${client_addr}/${mask} dev client.0
+${nsjoin_binary} /tmp/server.netns ip addr add ${server_addr}/${mask} dev server.0
+if [ "${disable_linux_gso}" == "1" ]; then
+ ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 tso off
+ ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 gro off
+ ${nsjoin_binary} /tmp/client.netns ethtool -K client.0 gso off
+ ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 tso off
+ ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 gso off
+ ${nsjoin_binary} /tmp/server.netns ethtool -K server.0 gro off
+fi
+${nsjoin_binary} /tmp/client.netns ip link set client.0 up
+${nsjoin_binary} /tmp/client.netns ip link set lo up
+${nsjoin_binary} /tmp/server.netns ip link set server.0 up
+${nsjoin_binary} /tmp/server.netns ip link set lo up
+ip link set dev client.1 up
+ip link set dev server.1 up
+
+${nsjoin_binary} /tmp/client.netns ${client_args} &
+client_pid=\$!
+${nsjoin_binary} /tmp/server.netns ${server_args} &
+server_pid=\$!
+
+# Start the iperf server.
+${nsjoin_binary} /tmp/server.netns iperf -p ${iperf_port} -s >&2 &
+iperf_pid=\$!
+
+# Show traffic information.
+if ! ${client} && ! ${server}; then
+ ${nsjoin_binary} /tmp/client.netns ping -c 100 -i 0.001 -W 1 ${server_addr} >&2 || true
+fi
+
+results_file=\$(mktemp)
+function cleanup {
+ rm -f \$results_file
+ kill -TERM \$client_pid
+ kill -TERM \$server_pid
+ wait \$client_pid
+ wait \$server_pid
+ kill -9 \$iperf_pid 2>/dev/null
+}
+
+# Allow failure from this point.
+set +e
+trap cleanup EXIT
+
+# Run the benchmark, recording the results file.
+while ${nsjoin_binary} /tmp/client.netns iperf \\
+ -p ${proxy_port} -c ${client_addr} -t ${duration} -f m -P ${num_client_threads} 2>&1 \\
+ | tee \$results_file \\
+ | grep "connect failed" >/dev/null; do
+ sleep 0.1 # Wait for all services.
+done
+
+# Unlink all relevant devices from the bridge. This is because when the bridge
+# is deleted, the kernel may hang. It appears that this problem is fixed in
+# upstream commit 1ce5cce895309862d2c35d922816adebe094fe4a.
+ip link set client.1 nomaster
+ip link set server.1 nomaster
+ip link set wan.0 nomaster
+ip link set wan.1 nomaster
+
+# Emit raw results.
+cat \$results_file >&2
+
+# Emit a useful result (final throughput).
+mbits=\$(grep Mbits/sec \$results_file \\
+ | sed -n -e 's/^.*[[:space:]]\\([[:digit:]]\\+\\(\\.[[:digit:]]\\+\\)\\?\\)[[:space:]]*Mbits\\/sec.*/\\1/p')
+client_cpu_ticks=\$(cat /proc/\$client_pid/stat \\
+ | awk '{print (\$14+\$15);}')
+server_cpu_ticks=\$(cat /proc/\$server_pid/stat \\
+ | awk '{print (\$14+\$15);}')
+ticks_per_sec=\$(getconf CLK_TCK)
+client_cpu_load=\$(bc -l <<< \$client_cpu_ticks/\$ticks_per_sec/${duration})
+server_cpu_load=\$(bc -l <<< \$server_cpu_ticks/\$ticks_per_sec/${duration})
+echo \$mbits \$client_cpu_load \$server_cpu_load
+EOF
diff --git a/benchmarks/tcp/tcp_proxy.go b/benchmarks/tcp/tcp_proxy.go
new file mode 100644
index 000000000..72ada5700
--- /dev/null
+++ b/benchmarks/tcp/tcp_proxy.go
@@ -0,0 +1,444 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Binary tcp_proxy is a simple TCP proxy.
+package main
+
+import (
+ "encoding/gob"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "math/rand"
+ "net"
+ "os"
+ "os/signal"
+ "regexp"
+ "runtime"
+ "runtime/pprof"
+ "strconv"
+ "syscall"
+ "time"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet"
+ "gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+ "gvisor.dev/gvisor/pkg/tcpip/network/arp"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+)
+
+var (
+ port = flag.Int("port", 0, "bind port (all addresses)")
+ forward = flag.String("forward", "", "forwarding target")
+ client = flag.Bool("client", false, "use netstack for listen")
+ server = flag.Bool("server", false, "use netstack for dial")
+
+ // Netstack-specific options.
+ mtu = flag.Int("mtu", 1280, "mtu for network stack")
+ addr = flag.String("addr", "", "address for tap-based netstack")
+ mask = flag.Int("mask", 8, "mask size for address")
+ iface = flag.String("iface", "", "network interface name to bind for netstack")
+ sack = flag.Bool("sack", false, "enable SACK support for netstack")
+ cubic = flag.Bool("cubic", false, "enable use of CUBIC congestion control for netstack")
+ gso = flag.Int("gso", 0, "GSO maximum size")
+ swgso = flag.Bool("swgso", false, "software-level GSO")
+ clientTCPProbeFile = flag.String("client_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.")
+ serverTCPProbeFile = flag.String("server_tcp_probe_file", "", "if specified, installs a tcp probe to dump endpoint state to the specified file.")
+ cpuprofile = flag.String("cpuprofile", "", "write cpu profile to the specified file.")
+ memprofile = flag.String("memprofile", "", "write memory profile to the specified file.")
+)
+
+type impl interface {
+ dial(address string) (net.Conn, error)
+ listen(port int) (net.Listener, error)
+ printStats()
+}
+
+type netImpl struct{}
+
+func (netImpl) dial(address string) (net.Conn, error) {
+ return net.Dial("tcp", address)
+}
+
+func (netImpl) listen(port int) (net.Listener, error) {
+ return net.Listen("tcp", fmt.Sprintf(":%d", port))
+}
+
+func (netImpl) printStats() {
+}
+
+const (
+ nicID = 1 // Fixed.
+ bufSize = 4 << 20 // 4MB.
+)
+
+type netstackImpl struct {
+ s *stack.Stack
+ addr tcpip.Address
+ mode string
+}
+
+func setupNetwork(ifaceName string, numChannels int) (fds []int, err error) {
+ // Get all interfaces in the namespace.
+ ifaces, err := net.Interfaces()
+ if err != nil {
+ return nil, fmt.Errorf("querying interfaces: %v", err)
+ }
+
+ for _, iface := range ifaces {
+ if iface.Name != ifaceName {
+ continue
+ }
+ // Create the socket.
+ const protocol = 0x0300 // htons(ETH_P_ALL)
+ fds := make([]int, numChannels)
+ for i := range fds {
+ fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+ if err != nil {
+ return nil, fmt.Errorf("unable to create raw socket: %v", err)
+ }
+
+ // Bind to the appropriate device.
+ ll := syscall.SockaddrLinklayer{
+ Protocol: protocol,
+ Ifindex: iface.Index,
+ Pkttype: syscall.PACKET_HOST,
+ }
+ if err := syscall.Bind(fd, &ll); err != nil {
+ return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+ }
+
+ // Raw sockets have a very small default SO_RCVBUF of 256KB;
+ // raise it to at least 4MB to reduce packet drops.
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF, bufSize); err != nil {
+ return nil, fmt.Errorf("setsockopt(..., SO_RCVBUF, %v,..) = %v", bufSize, err)
+ }
+
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF, bufSize); err != nil {
+ return nil, fmt.Errorf("setsockopt(..., SO_SNDBUF, %v,..) = %v", bufSize, err)
+ }
+
+ if !*swgso && *gso != 0 {
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
+ return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
+ }
+ }
+ fds[i] = fd
+ }
+ return fds, nil
+ }
+ return nil, fmt.Errorf("failed to find interface: %v", ifaceName)
+}
+
+func newNetstackImpl(mode string) (impl, error) {
+ fds, err := setupNetwork(*iface, runtime.GOMAXPROCS(-1))
+ if err != nil {
+ return nil, err
+ }
+
+ // Parse details.
+ parsedAddr := tcpip.Address(net.ParseIP(*addr).To4())
+ parsedDest := tcpip.Address("") // Filled in below.
+ parsedMask := tcpip.AddressMask("") // Filled in below.
+ switch *mask {
+ case 8:
+ parsedDest = tcpip.Address([]byte{parsedAddr[0], 0, 0, 0})
+ parsedMask = tcpip.AddressMask([]byte{0xff, 0, 0, 0})
+ case 16:
+ parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], 0, 0})
+ parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0, 0})
+ case 24:
+ parsedDest = tcpip.Address([]byte{parsedAddr[0], parsedAddr[1], parsedAddr[2], 0})
+ parsedMask = tcpip.AddressMask([]byte{0xff, 0xff, 0xff, 0})
+ default:
+ // This is just laziness; we don't expect a different mask.
+ return nil, fmt.Errorf("mask %d not supported", *mask)
+ }
+
+ // Create a new network stack.
+ netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()}
+ transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol()}
+ s := stack.New(stack.Options{
+ NetworkProtocols: netProtos,
+ TransportProtocols: transProtos,
+ })
+
+ // Generate a new mac for the eth device.
+ mac := make(net.HardwareAddr, 6)
+ rand.Read(mac) // Fill with random data.
+ mac[0] &^= 0x1 // Clear multicast bit.
+ mac[0] |= 0x2 // Set local assignment bit (IEEE802).
+ ep, err := fdbased.New(&fdbased.Options{
+ FDs: fds,
+ MTU: uint32(*mtu),
+ EthernetHeader: true,
+ Address: tcpip.LinkAddress(mac),
+ // Enable checksum generation as we need to generate valid
+ // checksums for the veth device to deliver our packets to the
+ // peer. But we do want to disable checksum verification as veth
+ // devices do perform GRO and the linux host kernel may not
+ // regenerate valid checksums after GRO.
+ TXChecksumOffload: false,
+ RXChecksumOffload: true,
+ PacketDispatchMode: fdbased.RecvMMsg,
+ GSOMaxSize: uint32(*gso),
+ SoftwareGSOEnabled: *swgso,
+ })
+ if err != nil {
+ return nil, fmt.Errorf("failed to create FD endpoint: %v", err)
+ }
+ if err := s.CreateNIC(nicID, ep); err != nil {
+ return nil, fmt.Errorf("error creating NIC %q: %v", *iface, err)
+ }
+ if err := s.AddAddress(nicID, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+ return nil, fmt.Errorf("error adding ARP address to %q: %v", *iface, err)
+ }
+ if err := s.AddAddress(nicID, ipv4.ProtocolNumber, parsedAddr); err != nil {
+ return nil, fmt.Errorf("error adding IP address to %q: %v", *iface, err)
+ }
+
+ subnet, err := tcpip.NewSubnet(parsedDest, parsedMask)
+ if err != nil {
+ return nil, fmt.Errorf("tcpip.Subnet(%s, %s): %s", parsedDest, parsedMask, err)
+ }
+ // Add default route; we only support a single subnet.
+ s.SetRouteTable([]tcpip.Route{
+ {
+ Destination: subnet,
+ NIC: nicID,
+ },
+ })
+
+ // Set protocol options.
+ if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(*sack)); err != nil {
+ return nil, fmt.Errorf("SetTransportProtocolOption for SACKEnabled failed: %v", err)
+ }
+
+ // Set Congestion Control to cubic if requested.
+ if *cubic {
+ if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.CongestionControlOption("cubic")); err != nil {
+ return nil, fmt.Errorf("SetTransportProtocolOption for CongestionControlOption(cubic) failed: %v", err)
+ }
+ }
+
+ return netstackImpl{
+ s: s,
+ addr: parsedAddr,
+ mode: mode,
+ }, nil
+}
+
+func (n netstackImpl) dial(address string) (net.Conn, error) {
+ host, port, err := net.SplitHostPort(address)
+ if err != nil {
+ return nil, err
+ }
+ if host == "" {
+ // A host must be provided for the dial.
+ return nil, fmt.Errorf("no host provided")
+ }
+ portNumber, err := strconv.Atoi(port)
+ if err != nil {
+ return nil, err
+ }
+ addr := tcpip.FullAddress{
+ NIC: nicID,
+ Addr: tcpip.Address(net.ParseIP(host).To4()),
+ Port: uint16(portNumber),
+ }
+ conn, err := gonet.DialTCP(n.s, addr, ipv4.ProtocolNumber)
+ if err != nil {
+ return nil, err
+ }
+ return conn, nil
+}
+
+func (n netstackImpl) listen(port int) (net.Listener, error) {
+ addr := tcpip.FullAddress{
+ NIC: nicID,
+ Port: uint16(port),
+ }
+ listener, err := gonet.NewListener(n.s, addr, ipv4.ProtocolNumber)
+ if err != nil {
+ return nil, err
+ }
+ return listener, nil
+}
+
+var zeroFieldsRegexp = regexp.MustCompile(`\s*[a-zA-Z0-9]*:0`)
+
+func (n netstackImpl) printStats() {
+ // Don't show zero fields.
+ stats := zeroFieldsRegexp.ReplaceAllString(fmt.Sprintf("%+v", n.s.Stats()), "")
+ log.Printf("netstack %s Stats: %+v\n", n.mode, stats)
+}
+
+// installProbe installs a TCP Probe function that will dump endpoint
+// state to the specified file. It also returns a close func() that
+// can be used to close the probeFile.
+func (n netstackImpl) installProbe(probeFileName string) (close func()) {
+ // Install Probe to dump out end point state.
+ probeFile, err := os.Create(probeFileName)
+ if err != nil {
+ log.Fatalf("failed to create tcp_probe file %s: %v", probeFileName, err)
+ }
+ probeEncoder := gob.NewEncoder(probeFile)
+ // Install a TCP Probe.
+ n.s.AddTCPProbe(func(state stack.TCPEndpointState) {
+ probeEncoder.Encode(state)
+ })
+ return func() { probeFile.Close() }
+}
+
+func main() {
+ flag.Parse()
+ if *port == 0 {
+ log.Fatalf("no port provided")
+ }
+ if *forward == "" {
+ log.Fatalf("no forward provided")
+ }
+ // Seed the random number generator to ensure that we are given MAC addresses
+ // that don't collide for the case of the client and server stacks.
+ rand.Seed(time.Now().UTC().UnixNano())
+
+ if *cpuprofile != "" {
+ f, err := os.Create(*cpuprofile)
+ if err != nil {
+ log.Fatal("could not create CPU profile: ", err)
+ }
+ defer func() {
+ if err := f.Close(); err != nil {
+ log.Print("error closing CPU profile: ", err)
+ }
+ }()
+ if err := pprof.StartCPUProfile(f); err != nil {
+ log.Fatal("could not start CPU profile: ", err)
+ }
+ defer pprof.StopCPUProfile()
+ }
+
+ var (
+ in impl
+ out impl
+ err error
+ )
+ if *server {
+ in, err = newNetstackImpl("server")
+ if *serverTCPProbeFile != "" {
+ defer in.(netstackImpl).installProbe(*serverTCPProbeFile)()
+ }
+
+ } else {
+ in = netImpl{}
+ }
+ if err != nil {
+ log.Fatalf("netstack error: %v", err)
+ }
+ if *client {
+ out, err = newNetstackImpl("client")
+ if *clientTCPProbeFile != "" {
+ defer out.(netstackImpl).installProbe(*clientTCPProbeFile)()
+ }
+ } else {
+ out = netImpl{}
+ }
+ if err != nil {
+ log.Fatalf("netstack error: %v", err)
+ }
+
+ // Dial forward before binding.
+ var next net.Conn
+ for {
+ next, err = out.dial(*forward)
+ if err == nil {
+ break
+ }
+ time.Sleep(50 * time.Millisecond)
+ log.Printf("connect failed retrying: %v", err)
+ }
+
+ // Bind once to the server socket.
+ listener, err := in.listen(*port)
+ if err != nil {
+ // Should not happen; everything must be bound by the time
+ // this proxy is started.
+ log.Fatalf("unable to listen: %v", err)
+ }
+ log.Printf("client=%v, server=%v, ready.", *client, *server)
+
+ sigs := make(chan os.Signal, 1)
+ signal.Notify(sigs, syscall.SIGTERM)
+ go func() {
+ <-sigs
+ if *cpuprofile != "" {
+ pprof.StopCPUProfile()
+ }
+ if *memprofile != "" {
+ f, err := os.Create(*memprofile)
+ if err != nil {
+ log.Fatal("could not create memory profile: ", err)
+ }
+ defer func() {
+ if err := f.Close(); err != nil {
+ log.Print("error closing memory profile: ", err)
+ }
+ }()
+ runtime.GC() // get up-to-date statistics
+ if err := pprof.WriteHeapProfile(f); err != nil {
+ log.Fatalf("Unable to write heap profile: %v", err)
+ }
+ }
+ os.Exit(0)
+ }()
+
+ for {
+ // Forward all connections.
+ inConn, err := listener.Accept()
+ if err != nil {
+ // This should not happen; we are listening
+ // successfully. Exhausted all available FDs?
+ log.Fatalf("accept error: %v", err)
+ }
+ log.Printf("incoming connection established.")
+
+ // Copy both ways.
+ go io.Copy(inConn, next)
+ go io.Copy(next, inConn)
+
+ // Print stats every second.
+ go func() {
+ t := time.NewTicker(time.Second)
+ defer t.Stop()
+ for {
+ <-t.C
+ in.printStats()
+ out.printStats()
+ }
+ }()
+
+ for {
+ // Dial again.
+ next, err = out.dial(*forward)
+ if err == nil {
+ break
+ }
+ }
+ }
+}
diff --git a/benchmarks/workloads/BUILD b/benchmarks/workloads/BUILD
new file mode 100644
index 000000000..ccb86af5b
--- /dev/null
+++ b/benchmarks/workloads/BUILD
@@ -0,0 +1,35 @@
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "workloads",
+ srcs = ["__init__.py"],
+)
+
+filegroup(
+ name = "files",
+ srcs = [
+ "//benchmarks/workloads/ab:tar",
+ "//benchmarks/workloads/absl:tar",
+ "//benchmarks/workloads/curl:tar",
+ "//benchmarks/workloads/ffmpeg:tar",
+ "//benchmarks/workloads/fio:tar",
+ "//benchmarks/workloads/httpd:tar",
+ "//benchmarks/workloads/iperf:tar",
+ "//benchmarks/workloads/netcat:tar",
+ "//benchmarks/workloads/nginx:tar",
+ "//benchmarks/workloads/node:tar",
+ "//benchmarks/workloads/node_template:tar",
+ "//benchmarks/workloads/redis:tar",
+ "//benchmarks/workloads/redisbenchmark:tar",
+ "//benchmarks/workloads/ruby:tar",
+ "//benchmarks/workloads/ruby_template:tar",
+ "//benchmarks/workloads/sleep:tar",
+ "//benchmarks/workloads/sysbench:tar",
+ "//benchmarks/workloads/syscall:tar",
+ "//benchmarks/workloads/tensorflow:tar",
+ "//benchmarks/workloads/true:tar",
+ ],
+)
diff --git a/benchmarks/workloads/__init__.py b/benchmarks/workloads/__init__.py
new file mode 100644
index 000000000..e12651e76
--- /dev/null
+++ b/benchmarks/workloads/__init__.py
@@ -0,0 +1,14 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Workloads, parsers and test data."""
diff --git a/benchmarks/workloads/ab/BUILD b/benchmarks/workloads/ab/BUILD
new file mode 100644
index 000000000..4fc0ab735
--- /dev/null
+++ b/benchmarks/workloads/ab/BUILD
@@ -0,0 +1,36 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "ab",
+ srcs = ["__init__.py"],
+)
+
+py_test(
+ name = "ab_test",
+ srcs = ["ab_test.py"],
+ python_version = "PY3",
+ deps = [
+ ":ab",
+ requirement("attrs", False),
+ requirement("atomicwrites", False),
+ requirement("more-itertools", False),
+ requirement("pathlib2", False),
+ requirement("pluggy", False),
+ requirement("py", False),
+ requirement("pytest", True),
+ requirement("six", False),
+ ],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/ab/Dockerfile b/benchmarks/workloads/ab/Dockerfile
new file mode 100644
index 000000000..0d0b6e2eb
--- /dev/null
+++ b/benchmarks/workloads/ab/Dockerfile
@@ -0,0 +1,15 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+ && apt-get update \
+ && apt-get install -y \
+ apache2-utils \
+ && rm -rf /var/lib/apt/lists/*
+
+# Parameterized workload.
+ENV requests 5000
+ENV connections 10
+ENV host localhost
+ENV port 8080
+ENV path notfound
+CMD ["sh", "-c", "ab -n ${requests} -c ${connections} http://${host}:${port}/${path}"]
diff --git a/benchmarks/workloads/ab/__init__.py b/benchmarks/workloads/ab/__init__.py
new file mode 100644
index 000000000..eedf8e083
--- /dev/null
+++ b/benchmarks/workloads/ab/__init__.py
@@ -0,0 +1,88 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ApacheBench tool."""
+
+import re
+
+SAMPLE_DATA = """This is ApacheBench, Version 2.3 <$Revision: 1826891 $>
+Copyright 1996 Adam Twiss, Zeus Technology Ltd, http://www.zeustech.net/
+Licensed to The Apache Software Foundation, http://www.apache.org/
+
+Benchmarking 10.10.10.10 (be patient).....done
+
+
+Server Software: Apache/2.4.38
+Server Hostname: 10.10.10.10
+Server Port: 80
+
+Document Path: /latin10k.txt
+Document Length: 210 bytes
+
+Concurrency Level: 1
+Time taken for tests: 0.180 seconds
+Complete requests: 100
+Failed requests: 0
+Non-2xx responses: 100
+Total transferred: 38800 bytes
+HTML transferred: 21000 bytes
+Requests per second: 556.44 [#/sec] (mean)
+Time per request: 1.797 [ms] (mean)
+Time per request: 1.797 [ms] (mean, across all concurrent requests)
+Transfer rate: 210.84 [Kbytes/sec] received
+
+Connection Times (ms)
+ min mean[+/-sd] median max
+Connect: 0 0 0.2 0 2
+Processing: 1 2 1.0 1 8
+Waiting: 1 1 1.0 1 7
+Total: 1 2 1.2 1 10
+
+Percentage of the requests served within a certain time (ms)
+ 50% 1
+ 66% 2
+ 75% 2
+ 80% 2
+ 90% 2
+ 95% 3
+ 98% 7
+ 99% 10
+ 100% 10 (longest request)"""
+
+
+# pylint: disable=unused-argument
+def sample(**kwargs) -> str:
+ return SAMPLE_DATA
+
+
+# pylint: disable=unused-argument
+def transfer_rate(data: str, **kwargs) -> float:
+ """Mean transfer rate in Kbytes/sec."""
+ regex = r"Transfer rate:\s+(\d+\.?\d+?)\s+\[Kbytes/sec\]\s+received"
+ return float(re.compile(regex).search(data).group(1))
+
+
+# pylint: disable=unused-argument
+def latency(data: str, **kwargs) -> float:
+ """Mean latency in milliseconds."""
+ regex = r"Total:\s+\d+\s+(\d+)\s+(\d+\.?\d+?)\s+\d+\s+\d+\s"
+ res = re.compile(regex).search(data)
+ return float(res.group(1))
+
+
+# pylint: disable=unused-argument
+def requests_per_second(data: str, **kwargs) -> float:
+ """Requests per second."""
+ regex = r"Requests per second:\s+(\d+\.?\d+?)\s+"
+ res = re.compile(regex).search(data)
+ return float(res.group(1))
diff --git a/benchmarks/workloads/ab/ab_test.py b/benchmarks/workloads/ab/ab_test.py
new file mode 100644
index 000000000..4afac2996
--- /dev/null
+++ b/benchmarks/workloads/ab/ab_test.py
@@ -0,0 +1,42 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Parser test."""
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import ab
+
+
+def test_transfer_rate_parser():
+ """Test transfer rate parser."""
+ res = ab.transfer_rate(ab.sample())
+ assert res == 210.84
+
+
+def test_latency_parser():
+ """Test latency parser."""
+ res = ab.latency(ab.sample())
+ assert res == 2
+
+
+def test_requests_per_second():
+ """Test requests per second parser."""
+ res = ab.requests_per_second(ab.sample())
+ assert res == 556.44
+
+
+if __name__ == "__main__":
+ sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/absl/BUILD b/benchmarks/workloads/absl/BUILD
new file mode 100644
index 000000000..61e010096
--- /dev/null
+++ b/benchmarks/workloads/absl/BUILD
@@ -0,0 +1,36 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "absl",
+ srcs = ["__init__.py"],
+)
+
+py_test(
+ name = "absl_test",
+ srcs = ["absl_test.py"],
+ python_version = "PY3",
+ deps = [
+ ":absl",
+ requirement("attrs", False),
+ requirement("atomicwrites", False),
+ requirement("more-itertools", False),
+ requirement("pathlib2", False),
+ requirement("pluggy", False),
+ requirement("py", False),
+ requirement("pytest", True),
+ requirement("six", False),
+ ],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/absl/Dockerfile b/benchmarks/workloads/absl/Dockerfile
new file mode 100644
index 000000000..e935c5ddc
--- /dev/null
+++ b/benchmarks/workloads/absl/Dockerfile
@@ -0,0 +1,24 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+ && apt-get update \
+ && apt-get install -y \
+ wget \
+ git \
+ pkg-config \
+ zip \
+ g++ \
+ zlib1g-dev \
+ unzip \
+ python3 \
+ && rm -rf /var/lib/apt/lists/*
+RUN wget https://github.com/bazelbuild/bazel/releases/download/0.27.0/bazel-0.27.0-installer-linux-x86_64.sh
+RUN chmod +x bazel-0.27.0-installer-linux-x86_64.sh
+RUN ./bazel-0.27.0-installer-linux-x86_64.sh
+
+RUN git clone https://github.com/abseil/abseil-cpp.git
+WORKDIR abseil-cpp
+RUN git checkout 43ef2148c0936ebf7cb4be6b19927a9d9d145b8f
+RUN bazel clean
+ENV path "absl/base/..."
+CMD bazel build ${path} 2>&1
diff --git a/benchmarks/workloads/absl/__init__.py b/benchmarks/workloads/absl/__init__.py
new file mode 100644
index 000000000..b40e3f915
--- /dev/null
+++ b/benchmarks/workloads/absl/__init__.py
@@ -0,0 +1,63 @@
+# python3
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ABSL build benchmark."""
+
+import re
+
+SAMPLE_BAZEL_OUTPUT = """Extracting Bazel installation...
+Starting local Bazel server and connecting to it...
+Loading:
+Loading: 0 packages loaded
+Loading: 0 packages loaded
+ currently loading: absl/algorithm ... (11 packages)
+Analyzing: 241 targets (16 packages loaded, 0 targets configured)
+Analyzing: 241 targets (21 packages loaded, 617 targets configured)
+Analyzing: 241 targets (27 packages loaded, 687 targets configured)
+Analyzing: 241 targets (32 packages loaded, 1105 targets configured)
+Analyzing: 241 targets (32 packages loaded, 1294 targets configured)
+Analyzing: 241 targets (35 packages loaded, 1575 targets configured)
+Analyzing: 241 targets (35 packages loaded, 1575 targets configured)
+Analyzing: 241 targets (36 packages loaded, 1603 targets configured)
+Analyzing: 241 targets (36 packages loaded, 1603 targets configured)
+INFO: Analyzed 241 targets (37 packages loaded, 1864 targets configured).
+INFO: Found 241 targets...
+[0 / 5] [Prepa] BazelWorkspaceStatusAction stable-status.txt
+[16 / 50] [Analy] Compiling absl/base/dynamic_annotations.cc ... (20 actions, 10 running)
+[60 / 77] Compiling external/com_google_googletest/googletest/src/gtest.cc; 5s processwrapper-sandbox ... (12 actions, 11 running)
+[158 / 174] Compiling absl/container/internal/raw_hash_set_test.cc; 2s processwrapper-sandbox ... (12 actions, 11 running)
+[278 / 302] Compiling absl/container/internal/raw_hash_set_test.cc; 6s processwrapper-sandbox ... (12 actions, 11 running)
+[384 / 406] Compiling absl/container/internal/raw_hash_set_test.cc; 10s processwrapper-sandbox ... (12 actions, 11 running)
+[581 / 604] Compiling absl/container/flat_hash_set_test.cc; 11s processwrapper-sandbox ... (12 actions, 11 running)
+[722 / 745] Compiling absl/container/node_hash_set_test.cc; 9s processwrapper-sandbox ... (12 actions, 11 running)
+[846 / 867] Compiling absl/hash/hash_test.cc; 11s processwrapper-sandbox ... (12 actions, 11 running)
+INFO: From Compiling absl/debugging/symbolize_test.cc:
+/tmp/cclCVipU.s: Assembler messages:
+/tmp/cclCVipU.s:1662: Warning: ignoring changed section attributes for .text
+[999 / 1,022] Compiling absl/hash/hash_test.cc; 19s processwrapper-sandbox ... (12 actions, 11 running)
+[1,082 / 1,084] Compiling absl/container/flat_hash_map_test.cc; 7s processwrapper-sandbox
+INFO: Elapsed time: 81.861s, Critical Path: 23.81s
+INFO: 515 processes: 515 processwrapper-sandbox.
+INFO: Build completed successfully, 1084 total actions
+INFO: Build completed successfully, 1084 total actions"""
+
+
+def sample():
+ return SAMPLE_BAZEL_OUTPUT
+
+
+# pylint: disable=unused-argument
+def elapsed_time(data: str, **kwargs) -> float:
+ """Returns the elapsed time for running an absl build."""
+ return float(re.compile(r"Elapsed time: (\d*.?\d*)s").search(data).group(1))
diff --git a/benchmarks/workloads/absl/absl_test.py b/benchmarks/workloads/absl/absl_test.py
new file mode 100644
index 000000000..41f216999
--- /dev/null
+++ b/benchmarks/workloads/absl/absl_test.py
@@ -0,0 +1,31 @@
+# python3
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ABSL build test."""
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import absl
+
+
+def test_elapsed_time():
+ """Test elapsed_time."""
+ res = absl.elapsed_time(absl.sample())
+ assert res == 81.861
+
+
+if __name__ == "__main__":
+ sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/curl/BUILD b/benchmarks/workloads/curl/BUILD
new file mode 100644
index 000000000..eb0fb6165
--- /dev/null
+++ b/benchmarks/workloads/curl/BUILD
@@ -0,0 +1,13 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/curl/Dockerfile b/benchmarks/workloads/curl/Dockerfile
new file mode 100644
index 000000000..336cb088a
--- /dev/null
+++ b/benchmarks/workloads/curl/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+ && apt-get update \
+ && apt-get install -y \
+ curl \
+ && rm -rf /var/lib/apt/lists/*
+
+# Accept a host and port parameter.
+ENV host localhost
+ENV port 8080
+
+# Spin until we make a successful request.
+CMD ["sh", "-c", "while ! curl -v -i http://$host:$port; do true; done"]
diff --git a/benchmarks/workloads/ffmpeg/BUILD b/benchmarks/workloads/ffmpeg/BUILD
new file mode 100644
index 000000000..be472dfb2
--- /dev/null
+++ b/benchmarks/workloads/ffmpeg/BUILD
@@ -0,0 +1,18 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "ffmpeg",
+ srcs = ["__init__.py"],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/ffmpeg/Dockerfile b/benchmarks/workloads/ffmpeg/Dockerfile
new file mode 100644
index 000000000..f2f530d7c
--- /dev/null
+++ b/benchmarks/workloads/ffmpeg/Dockerfile
@@ -0,0 +1,10 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+ && apt-get update \
+ && apt-get install -y \
+ ffmpeg \
+ && rm -rf /var/lib/apt/lists/*
+WORKDIR /media
+ADD https://samples.ffmpeg.org/MPEG-4/video.mp4 video.mp4
+CMD ["ffmpeg", "-i", "video.mp4", "-c:v", "libx264", "-preset", "veryslow", "output.mp4"]
diff --git a/benchmarks/workloads/ffmpeg/__init__.py b/benchmarks/workloads/ffmpeg/__init__.py
new file mode 100644
index 000000000..7578a443b
--- /dev/null
+++ b/benchmarks/workloads/ffmpeg/__init__.py
@@ -0,0 +1,20 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Simple ffmpeg workload."""
+
+
+# pylint: disable=unused-argument
+def run_time(value, **kwargs):
+ """Returns the startup and runtime of the ffmpeg workload in seconds."""
+ return value
diff --git a/benchmarks/workloads/fio/BUILD b/benchmarks/workloads/fio/BUILD
new file mode 100644
index 000000000..de257adad
--- /dev/null
+++ b/benchmarks/workloads/fio/BUILD
@@ -0,0 +1,36 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "fio",
+ srcs = ["__init__.py"],
+)
+
+py_test(
+ name = "fio_test",
+ srcs = ["fio_test.py"],
+ python_version = "PY3",
+ deps = [
+ ":fio",
+ requirement("attrs", False),
+ requirement("atomicwrites", False),
+ requirement("more-itertools", False),
+ requirement("pathlib2", False),
+ requirement("pluggy", False),
+ requirement("py", False),
+ requirement("pytest", True),
+ requirement("six", False),
+ ],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/fio/Dockerfile b/benchmarks/workloads/fio/Dockerfile
new file mode 100644
index 000000000..b3cf864eb
--- /dev/null
+++ b/benchmarks/workloads/fio/Dockerfile
@@ -0,0 +1,23 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+ && apt-get update \
+ && apt-get install -y \
+ fio \
+ && rm -rf /var/lib/apt/lists/*
+
+# Parameterized test.
+ENV test write
+ENV ioengine sync
+ENV size 5000000
+ENV iodepth 4
+ENV blocksize "1m"
+ENV time ""
+ENV path "/disk/file.dat"
+ENV ramp_time 0
+
+CMD ["sh", "-c", "fio --output-format=json --name=test --ramp_time=${ramp_time} --ioengine=${ioengine} --size=${size} \
+--filename=${path} --iodepth=${iodepth} --bs=${blocksize} --rw=${test} ${time}"]
+
+
+
diff --git a/benchmarks/workloads/fio/__init__.py b/benchmarks/workloads/fio/__init__.py
new file mode 100644
index 000000000..52711e956
--- /dev/null
+++ b/benchmarks/workloads/fio/__init__.py
@@ -0,0 +1,369 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""FIO benchmark tool."""
+
+import json
+
+SAMPLE_DATA = """
+{
+ "fio version" : "fio-3.1",
+ "timestamp" : 1554837456,
+ "timestamp_ms" : 1554837456621,
+ "time" : "Tue Apr 9 19:17:36 2019",
+ "jobs" : [
+ {
+ "jobname" : "test",
+ "groupid" : 0,
+ "error" : 0,
+ "eta" : 2147483647,
+ "elapsed" : 1,
+ "job options" : {
+ "name" : "test",
+ "ioengine" : "sync",
+ "size" : "1073741824",
+ "filename" : "/disk/file.dat",
+ "iodepth" : "4",
+ "bs" : "4096",
+ "rw" : "write"
+ },
+ "read" : {
+ "io_bytes" : 0,
+ "io_kbytes" : 0,
+ "bw" : 0,
+ "iops" : 0.000000,
+ "runtime" : 0,
+ "total_ios" : 0,
+ "short_ios" : 0,
+ "drop_ios" : 0,
+ "slat_ns" : {
+ "min" : 0,
+ "max" : 0,
+ "mean" : 0.000000,
+ "stddev" : 0.000000
+ },
+ "clat_ns" : {
+ "min" : 0,
+ "max" : 0,
+ "mean" : 0.000000,
+ "stddev" : 0.000000,
+ "percentile" : {
+ "1.000000" : 0,
+ "5.000000" : 0,
+ "10.000000" : 0,
+ "20.000000" : 0,
+ "30.000000" : 0,
+ "40.000000" : 0,
+ "50.000000" : 0,
+ "60.000000" : 0,
+ "70.000000" : 0,
+ "80.000000" : 0,
+ "90.000000" : 0,
+ "95.000000" : 0,
+ "99.000000" : 0,
+ "99.500000" : 0,
+ "99.900000" : 0,
+ "99.950000" : 0,
+ "99.990000" : 0,
+ "0.00" : 0,
+ "0.00" : 0,
+ "0.00" : 0
+ }
+ },
+ "lat_ns" : {
+ "min" : 0,
+ "max" : 0,
+ "mean" : 0.000000,
+ "stddev" : 0.000000
+ },
+ "bw_min" : 0,
+ "bw_max" : 0,
+ "bw_agg" : 0.000000,
+ "bw_mean" : 0.000000,
+ "bw_dev" : 0.000000,
+ "bw_samples" : 0,
+ "iops_min" : 0,
+ "iops_max" : 0,
+ "iops_mean" : 0.000000,
+ "iops_stddev" : 0.000000,
+ "iops_samples" : 0
+ },
+ "write" : {
+ "io_bytes" : 1073741824,
+ "io_kbytes" : 1048576,
+ "bw" : 1753471,
+ "iops" : 438367.892977,
+ "runtime" : 598,
+ "total_ios" : 262144,
+ "short_ios" : 0,
+ "drop_ios" : 0,
+ "slat_ns" : {
+ "min" : 0,
+ "max" : 0,
+ "mean" : 0.000000,
+ "stddev" : 0.000000
+ },
+ "clat_ns" : {
+ "min" : 1693,
+ "max" : 754733,
+ "mean" : 2076.404373,
+ "stddev" : 1724.195529,
+ "percentile" : {
+ "1.000000" : 1736,
+ "5.000000" : 1752,
+ "10.000000" : 1768,
+ "20.000000" : 1784,
+ "30.000000" : 1800,
+ "40.000000" : 1800,
+ "50.000000" : 1816,
+ "60.000000" : 1816,
+ "70.000000" : 1848,
+ "80.000000" : 1928,
+ "90.000000" : 2512,
+ "95.000000" : 2992,
+ "99.000000" : 6176,
+ "99.500000" : 6304,
+ "99.900000" : 11328,
+ "99.950000" : 15168,
+ "99.990000" : 17792,
+ "0.00" : 0,
+ "0.00" : 0,
+ "0.00" : 0
+ }
+ },
+ "lat_ns" : {
+ "min" : 1731,
+ "max" : 754770,
+ "mean" : 2117.878979,
+ "stddev" : 1730.290512
+ },
+ "bw_min" : 1731120,
+ "bw_max" : 1731120,
+ "bw_agg" : 98.725328,
+ "bw_mean" : 1731120.000000,
+ "bw_dev" : 0.000000,
+ "bw_samples" : 1,
+ "iops_min" : 432780,
+ "iops_max" : 432780,
+ "iops_mean" : 432780.000000,
+ "iops_stddev" : 0.000000,
+ "iops_samples" : 1
+ },
+ "trim" : {
+ "io_bytes" : 0,
+ "io_kbytes" : 0,
+ "bw" : 0,
+ "iops" : 0.000000,
+ "runtime" : 0,
+ "total_ios" : 0,
+ "short_ios" : 0,
+ "drop_ios" : 0,
+ "slat_ns" : {
+ "min" : 0,
+ "max" : 0,
+ "mean" : 0.000000,
+ "stddev" : 0.000000
+ },
+ "clat_ns" : {
+ "min" : 0,
+ "max" : 0,
+ "mean" : 0.000000,
+ "stddev" : 0.000000,
+ "percentile" : {
+ "1.000000" : 0,
+ "5.000000" : 0,
+ "10.000000" : 0,
+ "20.000000" : 0,
+ "30.000000" : 0,
+ "40.000000" : 0,
+ "50.000000" : 0,
+ "60.000000" : 0,
+ "70.000000" : 0,
+ "80.000000" : 0,
+ "90.000000" : 0,
+ "95.000000" : 0,
+ "99.000000" : 0,
+ "99.500000" : 0,
+ "99.900000" : 0,
+ "99.950000" : 0,
+ "99.990000" : 0,
+ "0.00" : 0,
+ "0.00" : 0,
+ "0.00" : 0
+ }
+ },
+ "lat_ns" : {
+ "min" : 0,
+ "max" : 0,
+ "mean" : 0.000000,
+ "stddev" : 0.000000
+ },
+ "bw_min" : 0,
+ "bw_max" : 0,
+ "bw_agg" : 0.000000,
+ "bw_mean" : 0.000000,
+ "bw_dev" : 0.000000,
+ "bw_samples" : 0,
+ "iops_min" : 0,
+ "iops_max" : 0,
+ "iops_mean" : 0.000000,
+ "iops_stddev" : 0.000000,
+ "iops_samples" : 0
+ },
+ "usr_cpu" : 17.922948,
+ "sys_cpu" : 81.574539,
+ "ctx" : 3,
+ "majf" : 0,
+ "minf" : 10,
+ "iodepth_level" : {
+ "1" : 100.000000,
+ "2" : 0.000000,
+ "4" : 0.000000,
+ "8" : 0.000000,
+ "16" : 0.000000,
+ "32" : 0.000000,
+ ">=64" : 0.000000
+ },
+ "latency_ns" : {
+ "2" : 0.000000,
+ "4" : 0.000000,
+ "10" : 0.000000,
+ "20" : 0.000000,
+ "50" : 0.000000,
+ "100" : 0.000000,
+ "250" : 0.000000,
+ "500" : 0.000000,
+ "750" : 0.000000,
+ "1000" : 0.000000
+ },
+ "latency_us" : {
+ "2" : 82.737350,
+ "4" : 12.605286,
+ "10" : 4.543686,
+ "20" : 0.107956,
+ "50" : 0.010000,
+ "100" : 0.000000,
+ "250" : 0.000000,
+ "500" : 0.000000,
+ "750" : 0.000000,
+ "1000" : 0.010000
+ },
+ "latency_ms" : {
+ "2" : 0.000000,
+ "4" : 0.000000,
+ "10" : 0.000000,
+ "20" : 0.000000,
+ "50" : 0.000000,
+ "100" : 0.000000,
+ "250" : 0.000000,
+ "500" : 0.000000,
+ "750" : 0.000000,
+ "1000" : 0.000000,
+ "2000" : 0.000000,
+ ">=2000" : 0.000000
+ },
+ "latency_depth" : 4,
+ "latency_target" : 0,
+ "latency_percentile" : 100.000000,
+ "latency_window" : 0
+ }
+ ],
+ "disk_util" : [
+ {
+ "name" : "dm-1",
+ "read_ios" : 0,
+ "write_ios" : 3,
+ "read_merges" : 0,
+ "write_merges" : 0,
+ "read_ticks" : 0,
+ "write_ticks" : 0,
+ "in_queue" : 0,
+ "util" : 0.000000,
+ "aggr_read_ios" : 0,
+ "aggr_write_ios" : 3,
+ "aggr_read_merges" : 0,
+ "aggr_write_merge" : 0,
+ "aggr_read_ticks" : 0,
+ "aggr_write_ticks" : 0,
+ "aggr_in_queue" : 0,
+ "aggr_util" : 0.000000
+ },
+ {
+ "name" : "dm-0",
+ "read_ios" : 0,
+ "write_ios" : 3,
+ "read_merges" : 0,
+ "write_merges" : 0,
+ "read_ticks" : 0,
+ "write_ticks" : 0,
+ "in_queue" : 0,
+ "util" : 0.000000,
+ "aggr_read_ios" : 0,
+ "aggr_write_ios" : 3,
+ "aggr_read_merges" : 0,
+ "aggr_write_merge" : 0,
+ "aggr_read_ticks" : 0,
+ "aggr_write_ticks" : 2,
+ "aggr_in_queue" : 0,
+ "aggr_util" : 0.000000
+ },
+ {
+ "name" : "nvme0n1",
+ "read_ios" : 0,
+ "write_ios" : 3,
+ "read_merges" : 0,
+ "write_merges" : 0,
+ "read_ticks" : 0,
+ "write_ticks" : 2,
+ "in_queue" : 0,
+ "util" : 0.000000
+ }
+ ]
+}
+"""
+
+
+# pylint: disable=unused-argument
+def sample(**kwargs) -> str:
+ return SAMPLE_DATA
+
+
+# pylint: disable=unused-argument
+def read_bandwidth(data: str, **kwargs) -> int:
+ """File I/O bandwidth."""
+ return json.loads(data)["jobs"][0]["read"]["bw"] * 1024
+
+
+# pylint: disable=unused-argument
+def write_bandwidth(data: str, **kwargs) -> int:
+ """File I/O bandwidth."""
+ return json.loads(data)["jobs"][0]["write"]["bw"] * 1024
+
+
+# pylint: disable=unused-argument
+def read_io_ops(data: str, **kwargs) -> float:
+ """File I/O operations per second."""
+ return float(json.loads(data)["jobs"][0]["read"]["iops"])
+
+
+# pylint: disable=unused-argument
+def write_io_ops(data: str, **kwargs) -> float:
+ """File I/O operations per second."""
+ return float(json.loads(data)["jobs"][0]["write"]["iops"])
+
+
+# Change function names so we just print "bandwidth" and "io_ops".
+read_bandwidth.__name__ = "bandwidth"
+write_bandwidth.__name__ = "bandwidth"
+read_io_ops.__name__ = "io_ops"
+write_io_ops.__name__ = "io_ops"
diff --git a/benchmarks/workloads/fio/fio_test.py b/benchmarks/workloads/fio/fio_test.py
new file mode 100644
index 000000000..04a6eeb7e
--- /dev/null
+++ b/benchmarks/workloads/fio/fio_test.py
@@ -0,0 +1,44 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Parser tests."""
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import fio
+
+
+def test_read_io_ops():
+ """Test read ops parser."""
+ assert fio.read_io_ops(fio.sample()) == 0.0
+
+
+def test_write_io_ops():
+ """Test write ops parser."""
+ assert fio.write_io_ops(fio.sample()) == 438367.892977
+
+
+def test_read_bandwidth():
+ """Test read bandwidth parser."""
+ assert fio.read_bandwidth(fio.sample()) == 0.0
+
+
+def test_write_bandwith():
+ """Test write bandwidth parser."""
+ assert fio.write_bandwidth(fio.sample()) == 1753471 * 1024
+
+
+if __name__ == "__main__":
+ sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/httpd/BUILD b/benchmarks/workloads/httpd/BUILD
new file mode 100644
index 000000000..eb0fb6165
--- /dev/null
+++ b/benchmarks/workloads/httpd/BUILD
@@ -0,0 +1,13 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/httpd/Dockerfile b/benchmarks/workloads/httpd/Dockerfile
new file mode 100644
index 000000000..5259c8f4f
--- /dev/null
+++ b/benchmarks/workloads/httpd/Dockerfile
@@ -0,0 +1,27 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+ && apt-get update \
+ && apt-get install -y \
+ apache2 \
+ && rm -rf /var/lib/apt/lists/*
+
+# Link the htdoc directory to tmp.
+RUN mkdir -p /usr/local/apache2/htdocs && \
+ cd /usr/local/apache2 && ln -s /tmp htdocs
+
+# Generate a bunch of relevant files.
+RUN mkdir -p /local && \
+ for size in 1 10 100 1000 1024 10240; do \
+ dd if=/dev/zero of=/local/latin${size}k.txt count=${size} bs=1024; \
+ done
+
+# Standard settings.
+ENV APACHE_RUN_DIR /tmp
+ENV APACHE_RUN_USER nobody
+ENV APACHE_RUN_GROUP nogroup
+ENV APACHE_LOG_DIR /tmp
+ENV APACHE_PID_FILE /tmp/apache.pid
+
+# Copy on start-up; serve everything from /tmp (including the configuration).
+CMD ["sh", "-c", "cp -a /local/* /tmp && apache2 -c \"ServerName localhost\" -c \"DocumentRoot /tmp\" -X"]
diff --git a/benchmarks/workloads/iperf/BUILD b/benchmarks/workloads/iperf/BUILD
new file mode 100644
index 000000000..8832a996c
--- /dev/null
+++ b/benchmarks/workloads/iperf/BUILD
@@ -0,0 +1,36 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "iperf",
+ srcs = ["__init__.py"],
+)
+
+py_test(
+ name = "iperf_test",
+ srcs = ["iperf_test.py"],
+ python_version = "PY3",
+ deps = [
+ ":iperf",
+ requirement("attrs", False),
+ requirement("atomicwrites", False),
+ requirement("more-itertools", False),
+ requirement("pathlib2", False),
+ requirement("pluggy", False),
+ requirement("py", False),
+ requirement("pytest", True),
+ requirement("six", False),
+ ],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/iperf/Dockerfile b/benchmarks/workloads/iperf/Dockerfile
new file mode 100644
index 000000000..9704c506c
--- /dev/null
+++ b/benchmarks/workloads/iperf/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+ && apt-get update \
+ && apt-get install -y \
+ iperf \
+ && rm -rf /var/lib/apt/lists/*
+
+# Accept a host parameter.
+ENV host ""
+ENV port 5001
+
+# Start as client if the host is provided.
+CMD ["sh", "-c", "test -z \"${host}\" && iperf -s || iperf -f K --realtime -c ${host} -p ${port}"]
diff --git a/benchmarks/workloads/iperf/__init__.py b/benchmarks/workloads/iperf/__init__.py
new file mode 100644
index 000000000..3817a7ade
--- /dev/null
+++ b/benchmarks/workloads/iperf/__init__.py
@@ -0,0 +1,40 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""iperf."""
+
+import re
+
+SAMPLE_DATA = """
+------------------------------------------------------------
+Client connecting to 10.138.15.215, TCP port 32779
+TCP window size: 45.0 KByte (default)
+------------------------------------------------------------
+[ 3] local 10.138.15.216 port 32866 connected with 10.138.15.215 port 32779
+[ ID] Interval Transfer Bandwidth
+[ 3] 0.0-10.0 sec 459520 KBytes 45900 KBytes/sec
+
+"""
+
+
+# pylint: disable=unused-argument
+def sample(**kwargs) -> str:
+ return SAMPLE_DATA
+
+
+# pylint: disable=unused-argument
+def bandwidth(data: str, **kwargs) -> float:
+ """Calculate the bandwidth."""
+ regex = r"\[\s*\d+\][^\n]+\s+(\d+\.?\d*)\s+KBytes/sec"
+ res = re.compile(regex).search(data)
+ return float(res.group(1)) * 1000
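+
+
+# For example, the transfer line in SAMPLE_DATA above ("45900 KBytes/sec")
+# yields 45900.0, which bandwidth() scales by 1000 to 45900000.0.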
diff --git a/benchmarks/workloads/iperf/iperf_test.py b/benchmarks/workloads/iperf/iperf_test.py
new file mode 100644
index 000000000..6959b7e8a
--- /dev/null
+++ b/benchmarks/workloads/iperf/iperf_test.py
@@ -0,0 +1,28 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for iperf."""
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import iperf
+
+
+def test_bandwidth():
+ assert iperf.bandwidth(iperf.sample()) == 45900 * 1000
+
+
+if __name__ == "__main__":
+ sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/netcat/BUILD b/benchmarks/workloads/netcat/BUILD
new file mode 100644
index 000000000..eb0fb6165
--- /dev/null
+++ b/benchmarks/workloads/netcat/BUILD
@@ -0,0 +1,13 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/netcat/Dockerfile b/benchmarks/workloads/netcat/Dockerfile
new file mode 100644
index 000000000..d8548d89a
--- /dev/null
+++ b/benchmarks/workloads/netcat/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+ && apt-get update \
+ && apt-get install -y \
+ netcat \
+ && rm -rf /var/lib/apt/lists/*
+
+# Accept a host and port parameter.
+ENV host localhost
+ENV port 8080
+
+# Spin until we make a successful request.
+CMD ["sh", "-c", "while ! nc -zv $host $port; do true; done"]
diff --git a/benchmarks/workloads/nginx/BUILD b/benchmarks/workloads/nginx/BUILD
new file mode 100644
index 000000000..eb0fb6165
--- /dev/null
+++ b/benchmarks/workloads/nginx/BUILD
@@ -0,0 +1,13 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/nginx/Dockerfile b/benchmarks/workloads/nginx/Dockerfile
new file mode 100644
index 000000000..b64eb52ae
--- /dev/null
+++ b/benchmarks/workloads/nginx/Dockerfile
@@ -0,0 +1 @@
+FROM nginx:1.15.10
diff --git a/benchmarks/workloads/node/BUILD b/benchmarks/workloads/node/BUILD
new file mode 100644
index 000000000..71cd9f519
--- /dev/null
+++ b/benchmarks/workloads/node/BUILD
@@ -0,0 +1,15 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ "index.js",
+ "package.json",
+ ],
+)
diff --git a/benchmarks/workloads/node/Dockerfile b/benchmarks/workloads/node/Dockerfile
new file mode 100644
index 000000000..139a38bf5
--- /dev/null
+++ b/benchmarks/workloads/node/Dockerfile
@@ -0,0 +1,2 @@
+FROM node:onbuild
+CMD ["node", "index.js"]
diff --git a/benchmarks/workloads/node/index.js b/benchmarks/workloads/node/index.js
new file mode 100644
index 000000000..584158462
--- /dev/null
+++ b/benchmarks/workloads/node/index.js
@@ -0,0 +1,28 @@
+'use strict';
+
+var start = new Date().getTime();
+
+// Load dependencies to simulate an average nodejs app.
+var req_0 = require('async');
+var req_1 = require('bluebird');
+var req_2 = require('firebase');
+var req_3 = require('firebase-admin');
+var req_4 = require('@google-cloud/container');
+var req_5 = require('@google-cloud/logging');
+var req_6 = require('@google-cloud/monitoring');
+var req_7 = require('@google-cloud/spanner');
+var req_8 = require('lodash');
+var req_9 = require('mailgun-js');
+var req_10 = require('request');
+var express = require('express');
+var app = express();
+
+var loaded = new Date().getTime() - start;
+app.get('/', function(req, res) {
+ res.send('Hello World!<br>Loaded in ' + loaded + 'ms');
+});
+
+console.log('Loaded in ' + loaded + ' ms');
+app.listen(8080, function() {
+ console.log('Listening on port 8080...');
+});
diff --git a/benchmarks/workloads/node/package.json b/benchmarks/workloads/node/package.json
new file mode 100644
index 000000000..c00b9b3cb
--- /dev/null
+++ b/benchmarks/workloads/node/package.json
@@ -0,0 +1,19 @@
+{
+ "name": "node",
+ "version": "1.0.0",
+ "main": "index.js",
+ "dependencies": {
+ "@google-cloud/container": "^0.3.0",
+ "@google-cloud/logging": "^4.2.0",
+ "@google-cloud/monitoring": "^0.6.0",
+ "@google-cloud/spanner": "^2.2.1",
+ "async": "^2.6.1",
+ "bluebird": "^3.5.3",
+ "express": "^4.16.4",
+ "firebase": "^5.7.2",
+ "firebase-admin": "^6.4.0",
+ "lodash": "^4.17.11",
+ "mailgun-js": "^0.22.0",
+ "request": "^2.88.0"
+ }
+}
diff --git a/benchmarks/workloads/node_template/BUILD b/benchmarks/workloads/node_template/BUILD
new file mode 100644
index 000000000..ca996f068
--- /dev/null
+++ b/benchmarks/workloads/node_template/BUILD
@@ -0,0 +1,17 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ "index.hbs",
+ "index.js",
+ "package.json",
+ "package-lock.json",
+ ],
+)
diff --git a/benchmarks/workloads/node_template/Dockerfile b/benchmarks/workloads/node_template/Dockerfile
new file mode 100644
index 000000000..7eb065d54
--- /dev/null
+++ b/benchmarks/workloads/node_template/Dockerfile
@@ -0,0 +1,5 @@
+FROM node:onbuild
+
+ENV host "127.0.0.1"
+
+CMD ["sh", "-c", "node index.js ${host}"]
diff --git a/benchmarks/workloads/node_template/index.hbs b/benchmarks/workloads/node_template/index.hbs
new file mode 100644
index 000000000..03feceb75
--- /dev/null
+++ b/benchmarks/workloads/node_template/index.hbs
@@ -0,0 +1,8 @@
+<!DOCTYPE html>
+<html>
+<body>
+ {{#each text}}
+ <p>{{this}}</p>
+ {{/each}}
+</body>
+</html>
diff --git a/benchmarks/workloads/node_template/index.js b/benchmarks/workloads/node_template/index.js
new file mode 100644
index 000000000..04a27f356
--- /dev/null
+++ b/benchmarks/workloads/node_template/index.js
@@ -0,0 +1,43 @@
+const app = require('express')();
+const path = require('path');
+const redis = require('redis');
+const srs = require('secure-random-string');
+
+// The hostname is the first argument.
+const host_name = process.argv[2];
+
+var client = redis.createClient({host: host_name, detect_buffers: true});
+
+app.set('views', __dirname);
+app.set('view engine', 'hbs');
+
+app.get('/', (req, res) => {
+ var tmp = [];
+ /* Pull four random keys from the redis server. */
+ for (let i = 0; i < 4; i++) {
+ client.get(Math.floor(Math.random() * (100)), function(err, reply) {
+ tmp.push(reply.toString());
+ });
+ }
+
+ res.render('index', {text: tmp});
+});
+
+/**
+ * Securely generate a random string.
+ * @param {number} len
+ * @return {string}
+ */
+function randomBody(len) {
+ return srs({alphanumeric: true, length: len});
+}
+
+/** Mutates one hundred keys randomly. */
+function generateText() {
+ for (let i = 0; i < 100; i++) {
+ client.set(i, randomBody(1024));
+ }
+}
+
+generateText();
+app.listen(8080);
diff --git a/benchmarks/workloads/node_template/package-lock.json b/benchmarks/workloads/node_template/package-lock.json
new file mode 100644
index 000000000..580e68aa5
--- /dev/null
+++ b/benchmarks/workloads/node_template/package-lock.json
@@ -0,0 +1,486 @@
+{
+ "name": "nodedum",
+ "version": "1.0.0",
+ "lockfileVersion": 1,
+ "requires": true,
+ "dependencies": {
+ "accepts": {
+ "version": "1.3.5",
+ "resolved": "https://registry.npmjs.org/accepts/-/accepts-1.3.5.tgz",
+ "integrity": "sha1-63d99gEXI6OxTopywIBcjoZ0a9I=",
+ "requires": {
+ "mime-types": "~2.1.18",
+ "negotiator": "0.6.1"
+ }
+ },
+ "array-flatten": {
+ "version": "1.1.1",
+ "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz",
+ "integrity": "sha1-ml9pkFGx5wczKPKgCJaLZOopVdI="
+ },
+ "async": {
+ "version": "2.6.2",
+ "resolved": "https://registry.npmjs.org/async/-/async-2.6.2.tgz",
+ "integrity": "sha512-H1qVYh1MYhEEFLsP97cVKqCGo7KfCyTt6uEWqsTBr9SO84oK9Uwbyd/yCW+6rKJLHksBNUVWZDAjfS+Ccx0Bbg==",
+ "requires": {
+ "lodash": "^4.17.11"
+ }
+ },
+ "body-parser": {
+ "version": "1.18.3",
+ "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.18.3.tgz",
+ "integrity": "sha1-WykhmP/dVTs6DyDe0FkrlWlVyLQ=",
+ "requires": {
+ "bytes": "3.0.0",
+ "content-type": "~1.0.4",
+ "debug": "2.6.9",
+ "depd": "~1.1.2",
+ "http-errors": "~1.6.3",
+ "iconv-lite": "0.4.23",
+ "on-finished": "~2.3.0",
+ "qs": "6.5.2",
+ "raw-body": "2.3.3",
+ "type-is": "~1.6.16"
+ }
+ },
+ "bytes": {
+ "version": "3.0.0",
+ "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.0.0.tgz",
+ "integrity": "sha1-0ygVQE1olpn4Wk6k+odV3ROpYEg="
+ },
+ "commander": {
+ "version": "2.20.0",
+ "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.0.tgz",
+ "integrity": "sha512-7j2y+40w61zy6YC2iRNpUe/NwhNyoXrYpHMrSunaMG64nRnaf96zO/KMQR4OyN/UnE5KLyEBnKHd4aG3rskjpQ==",
+ "optional": true
+ },
+ "content-disposition": {
+ "version": "0.5.2",
+ "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-0.5.2.tgz",
+ "integrity": "sha1-DPaLud318r55YcOoUXjLhdunjLQ="
+ },
+ "content-type": {
+ "version": "1.0.4",
+ "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.4.tgz",
+ "integrity": "sha512-hIP3EEPs8tB9AT1L+NUqtwOAps4mk2Zob89MWXMHjHWg9milF/j4osnnQLXBCBFBk/tvIG/tUc9mOUJiPBhPXA=="
+ },
+ "cookie": {
+ "version": "0.3.1",
+ "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.3.1.tgz",
+ "integrity": "sha1-5+Ch+e9DtMi6klxcWpboBtFoc7s="
+ },
+ "cookie-signature": {
+ "version": "1.0.6",
+ "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz",
+ "integrity": "sha1-4wOogrNCzD7oylE6eZmXNNqzriw="
+ },
+ "debug": {
+ "version": "2.6.9",
+ "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
+ "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
+ "requires": {
+ "ms": "2.0.0"
+ }
+ },
+ "depd": {
+ "version": "1.1.2",
+ "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
+ "integrity": "sha1-m81S4UwJd2PnSbJ0xDRu0uVgtak="
+ },
+ "destroy": {
+ "version": "1.0.4",
+ "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.0.4.tgz",
+ "integrity": "sha1-l4hXRCxEdJ5CBmE+N5RiBYJqvYA="
+ },
+ "double-ended-queue": {
+ "version": "2.1.0-0",
+ "resolved": "https://registry.npmjs.org/double-ended-queue/-/double-ended-queue-2.1.0-0.tgz",
+ "integrity": "sha1-ED01J/0xUo9AGIEwyEHv3XgmTlw="
+ },
+ "ee-first": {
+ "version": "1.1.1",
+ "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
+ "integrity": "sha1-WQxhFWsK4vTwJVcyoViyZrxWsh0="
+ },
+ "encodeurl": {
+ "version": "1.0.2",
+ "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz",
+ "integrity": "sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k="
+ },
+ "escape-html": {
+ "version": "1.0.3",
+ "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz",
+ "integrity": "sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg="
+ },
+ "etag": {
+ "version": "1.8.1",
+ "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz",
+ "integrity": "sha1-Qa4u62XvpiJorr/qg6x9eSmbCIc="
+ },
+ "express": {
+ "version": "4.16.4",
+ "resolved": "https://registry.npmjs.org/express/-/express-4.16.4.tgz",
+ "integrity": "sha512-j12Uuyb4FMrd/qQAm6uCHAkPtO8FDTRJZBDd5D2KOL2eLaz1yUNdUB/NOIyq0iU4q4cFarsUCrnFDPBcnksuOg==",
+ "requires": {
+ "accepts": "~1.3.5",
+ "array-flatten": "1.1.1",
+ "body-parser": "1.18.3",
+ "content-disposition": "0.5.2",
+ "content-type": "~1.0.4",
+ "cookie": "0.3.1",
+ "cookie-signature": "1.0.6",
+ "debug": "2.6.9",
+ "depd": "~1.1.2",
+ "encodeurl": "~1.0.2",
+ "escape-html": "~1.0.3",
+ "etag": "~1.8.1",
+ "finalhandler": "1.1.1",
+ "fresh": "0.5.2",
+ "merge-descriptors": "1.0.1",
+ "methods": "~1.1.2",
+ "on-finished": "~2.3.0",
+ "parseurl": "~1.3.2",
+ "path-to-regexp": "0.1.7",
+ "proxy-addr": "~2.0.4",
+ "qs": "6.5.2",
+ "range-parser": "~1.2.0",
+ "safe-buffer": "5.1.2",
+ "send": "0.16.2",
+ "serve-static": "1.13.2",
+ "setprototypeof": "1.1.0",
+ "statuses": "~1.4.0",
+ "type-is": "~1.6.16",
+ "utils-merge": "1.0.1",
+ "vary": "~1.1.2"
+ }
+ },
+ "finalhandler": {
+ "version": "1.1.1",
+ "resolved": "http://registry.npmjs.org/finalhandler/-/finalhandler-1.1.1.tgz",
+ "integrity": "sha512-Y1GUDo39ez4aHAw7MysnUD5JzYX+WaIj8I57kO3aEPT1fFRL4sr7mjei97FgnwhAyyzRYmQZaTHb2+9uZ1dPtg==",
+ "requires": {
+ "debug": "2.6.9",
+ "encodeurl": "~1.0.2",
+ "escape-html": "~1.0.3",
+ "on-finished": "~2.3.0",
+ "parseurl": "~1.3.2",
+ "statuses": "~1.4.0",
+ "unpipe": "~1.0.0"
+ }
+ },
+ "foreachasync": {
+ "version": "3.0.0",
+ "resolved": "https://registry.npmjs.org/foreachasync/-/foreachasync-3.0.0.tgz",
+ "integrity": "sha1-VQKYfchxS+M5IJfzLgBxyd7gfPY="
+ },
+ "forwarded": {
+ "version": "0.1.2",
+ "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.1.2.tgz",
+ "integrity": "sha1-mMI9qxF1ZXuMBXPozszZGw/xjIQ="
+ },
+ "fresh": {
+ "version": "0.5.2",
+ "resolved": "https://registry.npmjs.org/fresh/-/fresh-0.5.2.tgz",
+ "integrity": "sha1-PYyt2Q2XZWn6g1qx+OSyOhBWBac="
+ },
+ "handlebars": {
+ "version": "4.0.14",
+ "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.0.14.tgz",
+ "integrity": "sha512-E7tDoyAA8ilZIV3xDJgl18sX3M8xB9/fMw8+mfW4msLW8jlX97bAnWgT3pmaNXuvzIEgSBMnAHfuXsB2hdzfow==",
+ "requires": {
+ "async": "^2.5.0",
+ "optimist": "^0.6.1",
+ "source-map": "^0.6.1",
+ "uglify-js": "^3.1.4"
+ }
+ },
+ "hbs": {
+ "version": "4.0.4",
+ "resolved": "https://registry.npmjs.org/hbs/-/hbs-4.0.4.tgz",
+ "integrity": "sha512-esVlyV/V59mKkwFai5YmPRSNIWZzhqL5YMN0++ueMxyK1cCfPa5f6JiHtapPKAIVAhQR6rpGxow0troav9WMEg==",
+ "requires": {
+ "handlebars": "4.0.14",
+ "walk": "2.3.9"
+ }
+ },
+ "http-errors": {
+ "version": "1.6.3",
+ "resolved": "http://registry.npmjs.org/http-errors/-/http-errors-1.6.3.tgz",
+ "integrity": "sha1-i1VoC7S+KDoLW/TqLjhYC+HZMg0=",
+ "requires": {
+ "depd": "~1.1.2",
+ "inherits": "2.0.3",
+ "setprototypeof": "1.1.0",
+ "statuses": ">= 1.4.0 < 2"
+ }
+ },
+ "iconv-lite": {
+ "version": "0.4.23",
+ "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.23.tgz",
+ "integrity": "sha512-neyTUVFtahjf0mB3dZT77u+8O0QB89jFdnBkd5P1JgYPbPaia3gXXOVL2fq8VyU2gMMD7SaN7QukTB/pmXYvDA==",
+ "requires": {
+ "safer-buffer": ">= 2.1.2 < 3"
+ }
+ },
+ "inherits": {
+ "version": "2.0.3",
+ "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz",
+ "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4="
+ },
+ "ipaddr.js": {
+ "version": "1.8.0",
+ "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.8.0.tgz",
+ "integrity": "sha1-6qM9bd16zo9/b+DJygRA5wZzix4="
+ },
+ "lodash": {
+ "version": "4.17.15",
+ "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.15.tgz",
+ "integrity": "sha512-8xOcRHvCjnocdS5cpwXQXVzmmh5e5+saE2QGoeQmbKmRS6J3VQppPOIt0MnmE+4xlZoumy0GPG0D0MVIQbNA1A=="
+ },
+ "media-typer": {
+ "version": "0.3.0",
+ "resolved": "http://registry.npmjs.org/media-typer/-/media-typer-0.3.0.tgz",
+ "integrity": "sha1-hxDXrwqmJvj/+hzgAWhUUmMlV0g="
+ },
+ "merge-descriptors": {
+ "version": "1.0.1",
+ "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz",
+ "integrity": "sha1-sAqqVW3YtEVoFQ7J0blT8/kMu2E="
+ },
+ "methods": {
+ "version": "1.1.2",
+ "resolved": "https://registry.npmjs.org/methods/-/methods-1.1.2.tgz",
+ "integrity": "sha1-VSmk1nZUE07cxSZmVoNbD4Ua/O4="
+ },
+ "mime": {
+ "version": "1.4.1",
+ "resolved": "https://registry.npmjs.org/mime/-/mime-1.4.1.tgz",
+ "integrity": "sha512-KI1+qOZu5DcW6wayYHSzR/tXKCDC5Om4s1z2QJjDULzLcmf3DvzS7oluY4HCTrc+9FiKmWUgeNLg7W3uIQvxtQ=="
+ },
+ "mime-db": {
+ "version": "1.37.0",
+ "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.37.0.tgz",
+ "integrity": "sha512-R3C4db6bgQhlIhPU48fUtdVmKnflq+hRdad7IyKhtFj06VPNVdk2RhiYL3UjQIlso8L+YxAtFkobT0VK+S/ybg=="
+ },
+ "mime-types": {
+ "version": "2.1.21",
+ "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.21.tgz",
+ "integrity": "sha512-3iL6DbwpyLzjR3xHSFNFeb9Nz/M8WDkX33t1GFQnFOllWk8pOrh/LSrB5OXlnlW5P9LH73X6loW/eogc+F5lJg==",
+ "requires": {
+ "mime-db": "~1.37.0"
+ }
+ },
+ "minimist": {
+ "version": "0.0.10",
+ "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.10.tgz",
+ "integrity": "sha1-3j+YVD2/lggr5IrRoMfNqDYwHc8="
+ },
+ "ms": {
+ "version": "2.0.0",
+ "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
+ "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
+ },
+ "negotiator": {
+ "version": "0.6.1",
+ "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.1.tgz",
+ "integrity": "sha1-KzJxhOiZIQEXeyhWP7XnECrNDKk="
+ },
+ "on-finished": {
+ "version": "2.3.0",
+ "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.3.0.tgz",
+ "integrity": "sha1-IPEzZIGwg811M3mSoWlxqi2QaUc=",
+ "requires": {
+ "ee-first": "1.1.1"
+ }
+ },
+ "optimist": {
+ "version": "0.6.1",
+ "resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz",
+ "integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=",
+ "requires": {
+ "minimist": "~0.0.1",
+ "wordwrap": "~0.0.2"
+ }
+ },
+ "parseurl": {
+ "version": "1.3.2",
+ "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.2.tgz",
+ "integrity": "sha1-/CidTtiZMRlGDBViUyYs3I3mW/M="
+ },
+ "path-to-regexp": {
+ "version": "0.1.7",
+ "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-0.1.7.tgz",
+ "integrity": "sha1-32BBeABfUi8V60SQ5yR6G/qmf4w="
+ },
+ "proxy-addr": {
+ "version": "2.0.4",
+ "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.4.tgz",
+ "integrity": "sha512-5erio2h9jp5CHGwcybmxmVqHmnCBZeewlfJ0pex+UW7Qny7OOZXTtH56TGNyBizkgiOwhJtMKrVzDTeKcySZwA==",
+ "requires": {
+ "forwarded": "~0.1.2",
+ "ipaddr.js": "1.8.0"
+ }
+ },
+ "qs": {
+ "version": "6.5.2",
+ "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz",
+ "integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA=="
+ },
+ "range-parser": {
+ "version": "1.2.0",
+ "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.0.tgz",
+ "integrity": "sha1-9JvmtIeJTdxA3MlKMi9hEJLgDV4="
+ },
+ "raw-body": {
+ "version": "2.3.3",
+ "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.3.3.tgz",
+ "integrity": "sha512-9esiElv1BrZoI3rCDuOuKCBRbuApGGaDPQfjSflGxdy4oyzqghxu6klEkkVIvBje+FF0BX9coEv8KqW6X/7njw==",
+ "requires": {
+ "bytes": "3.0.0",
+ "http-errors": "1.6.3",
+ "iconv-lite": "0.4.23",
+ "unpipe": "1.0.0"
+ }
+ },
+ "redis": {
+ "version": "2.8.0",
+ "resolved": "https://registry.npmjs.org/redis/-/redis-2.8.0.tgz",
+ "integrity": "sha512-M1OkonEQwtRmZv4tEWF2VgpG0JWJ8Fv1PhlgT5+B+uNq2cA3Rt1Yt/ryoR+vQNOQcIEgdCdfH0jr3bDpihAw1A==",
+ "requires": {
+ "double-ended-queue": "^2.1.0-0",
+ "redis-commands": "^1.2.0",
+ "redis-parser": "^2.6.0"
+ },
+ "dependencies": {
+ "redis-commands": {
+ "version": "1.4.0",
+ "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.4.0.tgz",
+ "integrity": "sha512-cu8EF+MtkwI4DLIT0x9P8qNTLFhQD4jLfxLR0cCNkeGzs87FN6879JOJwNQR/1zD7aSYNbU0hgsV9zGY71Itvw=="
+ },
+ "redis-parser": {
+ "version": "2.6.0",
+ "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-2.6.0.tgz",
+ "integrity": "sha1-Uu0J2srBCPGmMcB+m2mUHnoZUEs="
+ }
+ }
+ },
+ "redis-commands": {
+ "version": "1.5.0",
+ "resolved": "https://registry.npmjs.org/redis-commands/-/redis-commands-1.5.0.tgz",
+ "integrity": "sha512-6KxamqpZ468MeQC3bkWmCB1fp56XL64D4Kf0zJSwDZbVLLm7KFkoIcHrgRvQ+sk8dnhySs7+yBg94yIkAK7aJg=="
+ },
+ "redis-parser": {
+ "version": "2.6.0",
+ "resolved": "https://registry.npmjs.org/redis-parser/-/redis-parser-2.6.0.tgz",
+ "integrity": "sha1-Uu0J2srBCPGmMcB+m2mUHnoZUEs="
+ },
+ "safe-buffer": {
+ "version": "5.1.2",
+ "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
+ "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g=="
+ },
+ "safer-buffer": {
+ "version": "2.1.2",
+ "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
+ "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
+ },
+ "secure-random-string": {
+ "version": "1.1.0",
+ "resolved": "https://registry.npmjs.org/secure-random-string/-/secure-random-string-1.1.0.tgz",
+ "integrity": "sha512-V/h8jqoz58zklNGybVhP++cWrxEPXlLM/6BeJ4e0a8zlb4BsbYRzFs16snrxByPa5LUxCVTD3M6EYIVIHR1fAg=="
+ },
+ "send": {
+ "version": "0.16.2",
+ "resolved": "https://registry.npmjs.org/send/-/send-0.16.2.tgz",
+ "integrity": "sha512-E64YFPUssFHEFBvpbbjr44NCLtI1AohxQ8ZSiJjQLskAdKuriYEP6VyGEsRDH8ScozGpkaX1BGvhanqCwkcEZw==",
+ "requires": {
+ "debug": "2.6.9",
+ "depd": "~1.1.2",
+ "destroy": "~1.0.4",
+ "encodeurl": "~1.0.2",
+ "escape-html": "~1.0.3",
+ "etag": "~1.8.1",
+ "fresh": "0.5.2",
+ "http-errors": "~1.6.2",
+ "mime": "1.4.1",
+ "ms": "2.0.0",
+ "on-finished": "~2.3.0",
+ "range-parser": "~1.2.0",
+ "statuses": "~1.4.0"
+ }
+ },
+ "serve-static": {
+ "version": "1.13.2",
+ "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-1.13.2.tgz",
+ "integrity": "sha512-p/tdJrO4U387R9oMjb1oj7qSMaMfmOyd4j9hOFoxZe2baQszgHcSWjuya/CiT5kgZZKRudHNOA0pYXOl8rQ5nw==",
+ "requires": {
+ "encodeurl": "~1.0.2",
+ "escape-html": "~1.0.3",
+ "parseurl": "~1.3.2",
+ "send": "0.16.2"
+ }
+ },
+ "setprototypeof": {
+ "version": "1.1.0",
+ "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.1.0.tgz",
+ "integrity": "sha512-BvE/TwpZX4FXExxOxZyRGQQv651MSwmWKZGqvmPcRIjDqWub67kTKuIMx43cZZrS/cBBzwBcNDWoFxt2XEFIpQ=="
+ },
+ "source-map": {
+ "version": "0.6.1",
+ "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
+ "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g=="
+ },
+ "statuses": {
+ "version": "1.4.0",
+ "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.4.0.tgz",
+ "integrity": "sha512-zhSCtt8v2NDrRlPQpCNtw/heZLtfUDqxBM1udqikb/Hbk52LK4nQSwr10u77iopCW5LsyHpuXS0GnEc48mLeew=="
+ },
+ "type-is": {
+ "version": "1.6.16",
+ "resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.16.tgz",
+ "integrity": "sha512-HRkVv/5qY2G6I8iab9cI7v1bOIdhm94dVjQCPFElW9W+3GeDOSHmy2EBYe4VTApuzolPcmgFTN3ftVJRKR2J9Q==",
+ "requires": {
+ "media-typer": "0.3.0",
+ "mime-types": "~2.1.18"
+ }
+ },
+ "uglify-js": {
+ "version": "3.5.9",
+ "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.5.9.tgz",
+ "integrity": "sha512-WpT0RqsDtAWPNJK955DEnb6xjymR8Fn0OlK4TT4pS0ASYsVPqr5ELhgwOwLCP5J5vHeJ4xmMmz3DEgdqC10JeQ==",
+ "optional": true,
+ "requires": {
+ "commander": "~2.20.0",
+ "source-map": "~0.6.1"
+ }
+ },
+ "unpipe": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz",
+ "integrity": "sha1-sr9O6FFKrmFltIF4KdIbLvSZBOw="
+ },
+ "utils-merge": {
+ "version": "1.0.1",
+ "resolved": "https://registry.npmjs.org/utils-merge/-/utils-merge-1.0.1.tgz",
+ "integrity": "sha1-n5VxD1CiZ5R7LMwSR0HBAoQn5xM="
+ },
+ "vary": {
+ "version": "1.1.2",
+ "resolved": "https://registry.npmjs.org/vary/-/vary-1.1.2.tgz",
+ "integrity": "sha1-IpnwLG3tMNSllhsLn3RSShj2NPw="
+ },
+ "walk": {
+ "version": "2.3.9",
+ "resolved": "https://registry.npmjs.org/walk/-/walk-2.3.9.tgz",
+ "integrity": "sha1-MbTbZnjyrgHDnqn7hyWpAx5Vins=",
+ "requires": {
+ "foreachasync": "^3.0.0"
+ }
+ },
+ "wordwrap": {
+ "version": "0.0.3",
+ "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz",
+ "integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc="
+ }
+ }
+}
diff --git a/benchmarks/workloads/node_template/package.json b/benchmarks/workloads/node_template/package.json
new file mode 100644
index 000000000..7dcadd523
--- /dev/null
+++ b/benchmarks/workloads/node_template/package.json
@@ -0,0 +1,19 @@
+{
+ "name": "nodedum",
+ "version": "1.0.0",
+ "description": "",
+ "main": "index.js",
+ "scripts": {
+ "test": "echo \"Error: no test specified\" && exit 1"
+ },
+ "author": "",
+ "license": "ISC",
+ "dependencies": {
+ "express": "^4.16.4",
+ "hbs": "^4.0.4",
+ "redis": "^2.8.0",
+ "redis-commands": "^1.2.0",
+ "redis-parser": "^2.6.0",
+ "secure-random-string": "^1.1.0"
+ }
+}
diff --git a/benchmarks/workloads/redis/BUILD b/benchmarks/workloads/redis/BUILD
new file mode 100644
index 000000000..eb0fb6165
--- /dev/null
+++ b/benchmarks/workloads/redis/BUILD
@@ -0,0 +1,13 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/redis/Dockerfile b/benchmarks/workloads/redis/Dockerfile
new file mode 100644
index 000000000..0f17249af
--- /dev/null
+++ b/benchmarks/workloads/redis/Dockerfile
@@ -0,0 +1 @@
+FROM redis:5.0.4
diff --git a/benchmarks/workloads/redisbenchmark/BUILD b/benchmarks/workloads/redisbenchmark/BUILD
new file mode 100644
index 000000000..f5994a815
--- /dev/null
+++ b/benchmarks/workloads/redisbenchmark/BUILD
@@ -0,0 +1,36 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "redisbenchmark",
+ srcs = ["__init__.py"],
+)
+
+py_test(
+ name = "redisbenchmark_test",
+ srcs = ["redisbenchmark_test.py"],
+ python_version = "PY3",
+ deps = [
+ ":redisbenchmark",
+ requirement("attrs", False),
+ requirement("atomicwrites", False),
+ requirement("more-itertools", False),
+ requirement("pathlib2", False),
+ requirement("pluggy", False),
+ requirement("py", False),
+ requirement("pytest", True),
+ requirement("six", False),
+ ],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/redisbenchmark/Dockerfile b/benchmarks/workloads/redisbenchmark/Dockerfile
new file mode 100644
index 000000000..f94f6442e
--- /dev/null
+++ b/benchmarks/workloads/redisbenchmark/Dockerfile
@@ -0,0 +1,4 @@
+FROM redis:5.0.4
+ENV host localhost
+ENV port 6379
+CMD ["sh", "-c", "redis-benchmark --csv -h ${host} -p ${port} ${flags}"]
diff --git a/benchmarks/workloads/redisbenchmark/__init__.py b/benchmarks/workloads/redisbenchmark/__init__.py
new file mode 100644
index 000000000..229cef5fa
--- /dev/null
+++ b/benchmarks/workloads/redisbenchmark/__init__.py
@@ -0,0 +1,85 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Redis-benchmark tool."""
+
+import re
+
+OPERATIONS = [
+ "PING_INLINE",
+ "PING_BULK",
+ "SET",
+ "GET",
+ "INCR",
+ "LPUSH",
+ "RPUSH",
+ "LPOP",
+ "RPOP",
+ "SADD",
+ "HSET",
+ "SPOP",
+ "LRANGE_100",
+ "LRANGE_300",
+ "LRANGE_500",
+ "LRANGE_600",
+ "MSET",
+]
+
+METRICS = dict()
+
+SAMPLE_DATA = """
+"PING_INLINE","48661.80"
+"PING_BULK","50301.81"
+"SET","48923.68"
+"GET","49382.71"
+"INCR","49975.02"
+"LPUSH","49875.31"
+"RPUSH","50276.52"
+"LPOP","50327.12"
+"RPOP","50556.12"
+"SADD","49504.95"
+"HSET","49504.95"
+"SPOP","50025.02"
+"LPUSH (needed to benchmark LRANGE)","48875.86"
+"LRANGE_100 (first 100 elements)","33955.86"
+"LRANGE_300 (first 300 elements)","16550.81"
+"LRANGE_500 (first 450 elements)","13653.74"
+"LRANGE_600 (first 600 elements)","11219.57"
+"MSET (10 keys)","44682.75"
+"""
+
+
+# pylint: disable=unused-argument
+def sample(**kwargs) -> str:
+ return SAMPLE_DATA
+
+
+# Bind a metric for each operation noted above.
+for op in OPERATIONS:
+
+ def bind(metric):
+ """Bind op to a new scope."""
+
+ # pylint: disable=unused-argument
+ def parse(data: str, **kwargs) -> float:
+ """Operation throughput in requests/sec."""
+ regex = r"\"" + metric + r"( .*)?\",\"(\d*.\d*)"
+ res = re.compile(regex).search(data)
+ if res:
+ return float(res.group(2))
+ return 0.0
+
+ parse.__name__ = metric
+ return parse
+
+ METRICS[op] = bind(op)
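+
+
+# For example, METRICS["GET"](SAMPLE_DATA) returns 49382.71 (requests/sec from
+# the sample above); operations that do not match the regex parse to 0.0.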
diff --git a/benchmarks/workloads/redisbenchmark/redisbenchmark_test.py b/benchmarks/workloads/redisbenchmark/redisbenchmark_test.py
new file mode 100644
index 000000000..419ced059
--- /dev/null
+++ b/benchmarks/workloads/redisbenchmark/redisbenchmark_test.py
@@ -0,0 +1,51 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Parser test."""
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import redisbenchmark
+
+RESULTS = {
+ "PING_INLINE": 48661.80,
+ "PING_BULK": 50301.81,
+ "SET": 48923.68,
+ "GET": 49382.71,
+ "INCR": 49975.02,
+ "LPUSH": 49875.31,
+ "RPUSH": 50276.52,
+ "LPOP": 50327.12,
+ "RPOP": 50556.12,
+ "SADD": 49504.95,
+ "HSET": 49504.95,
+ "SPOP": 50025.02,
+ "LRANGE_100": 33955.86,
+ "LRANGE_300": 16550.81,
+ "LRANGE_500": 13653.74,
+ "LRANGE_600": 11219.57,
+ "MSET": 44682.75
+}
+
+
+def test_metrics():
+ """Test all metrics."""
+ for (metric, func) in redisbenchmark.METRICS.items():
+ res = func(redisbenchmark.sample())
+ assert float(res) == RESULTS[metric]
+
+
+if __name__ == "__main__":
+ sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/ruby/BUILD b/benchmarks/workloads/ruby/BUILD
new file mode 100644
index 000000000..e37d77804
--- /dev/null
+++ b/benchmarks/workloads/ruby/BUILD
@@ -0,0 +1,28 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+filegroup(
+ name = "files",
+ srcs = [
+ "Dockerfile",
+ "Gemfile",
+ "Gemfile.lock",
+ "config.ru",
+ "index.rb",
+ ],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ "Gemfile",
+ "Gemfile.lock",
+ "config.ru",
+ "index.rb",
+ ],
+)
diff --git a/benchmarks/workloads/ruby/Dockerfile b/benchmarks/workloads/ruby/Dockerfile
new file mode 100644
index 000000000..a9a7a7086
--- /dev/null
+++ b/benchmarks/workloads/ruby/Dockerfile
@@ -0,0 +1,28 @@
+# example based on https://github.com/errm/fib
+
+FROM ruby:2.5
+
+RUN apt-get update -qq && apt-get install -y build-essential libpq-dev nodejs libsodium-dev
+
+# Set an environment variable where the Rails app is installed to inside of Docker image
+ENV RAILS_ROOT /var/www/app_name
+RUN mkdir -p $RAILS_ROOT
+
+# Set working directory
+WORKDIR $RAILS_ROOT
+
+# Setting env up
+ENV RAILS_ENV='production'
+ENV RACK_ENV='production'
+
+# Adding gems
+COPY Gemfile Gemfile
+COPY Gemfile.lock Gemfile.lock
+RUN bundle install --jobs 20 --retry 5 --without development test
+
+# Adding project files
+COPY . .
+
+EXPOSE $PORT
+STOPSIGNAL SIGINT
+CMD ["bundle", "exec", "puma", "config.ru"]
diff --git a/benchmarks/workloads/ruby/Gemfile b/benchmarks/workloads/ruby/Gemfile
new file mode 100644
index 000000000..8f1bdad6e
--- /dev/null
+++ b/benchmarks/workloads/ruby/Gemfile
@@ -0,0 +1,12 @@
+source "https://rubygems.org"
+# load a bunch of dependencies to take up memory
+gem "sinatra"
+gem "puma"
+gem "redis"
+gem 'rake'
+gem 'squid', '~> 1.4'
+gem 'cassandra-driver'
+gem 'ruby-fann'
+gem 'rbnacl'
+gem 'bcrypt'
+gem "activemerchant" \ No newline at end of file
diff --git a/benchmarks/workloads/ruby/Gemfile.lock b/benchmarks/workloads/ruby/Gemfile.lock
new file mode 100644
index 000000000..b44817bd3
--- /dev/null
+++ b/benchmarks/workloads/ruby/Gemfile.lock
@@ -0,0 +1,55 @@
+GEM
+ remote: https://rubygems.org/
+ specs:
+ activesupport (5.2.3)
+ concurrent-ruby (~> 1.0, >= 1.0.2)
+ i18n (>= 0.7, < 2)
+ minitest (~> 5.1)
+ tzinfo (~> 1.1)
+ cassandra-driver (3.2.3)
+ ione (~> 1.2)
+ concurrent-ruby (1.1.5)
+ i18n (1.6.0)
+ concurrent-ruby (~> 1.0)
+ ione (1.2.4)
+ minitest (5.11.3)
+ mustermann (1.0.3)
+ pdf-core (0.7.0)
+ prawn (2.2.2)
+ pdf-core (~> 0.7.0)
+ ttfunk (~> 1.5)
+ puma (3.12.1)
+ rack (2.0.7)
+ rack-protection (2.0.5)
+ rack
+ rake (12.3.2)
+ redis (4.1.1)
+ ruby-fann (1.2.6)
+ sinatra (2.0.5)
+ mustermann (~> 1.0)
+ rack (~> 2.0)
+ rack-protection (= 2.0.5)
+ tilt (~> 2.0)
+ squid (1.4.1)
+ activesupport (>= 4.0)
+ prawn (~> 2.2)
+ thread_safe (0.3.6)
+ tilt (2.0.9)
+ ttfunk (1.5.1)
+ tzinfo (1.2.5)
+ thread_safe (~> 0.1)
+
+PLATFORMS
+ ruby
+
+DEPENDENCIES
+ cassandra-driver
+ puma
+ rake
+ redis
+ ruby-fann
+ sinatra
+ squid (~> 1.4)
+
+BUNDLED WITH
+ 1.17.1
diff --git a/benchmarks/workloads/ruby/config.ru b/benchmarks/workloads/ruby/config.ru
new file mode 100755
index 000000000..fbd5acc82
--- /dev/null
+++ b/benchmarks/workloads/ruby/config.ru
@@ -0,0 +1,2 @@
+require './index'
+run Sinatra::Application \ No newline at end of file
diff --git a/benchmarks/workloads/ruby/index.rb b/benchmarks/workloads/ruby/index.rb
new file mode 100755
index 000000000..5fa85af93
--- /dev/null
+++ b/benchmarks/workloads/ruby/index.rb
@@ -0,0 +1,14 @@
+require "sinatra"
+require "puma"
+require "redis"
+require "rake"
+require "squid"
+require "cassandra"
+require "ruby-fann"
+require "rbnacl"
+require "bcrypt"
+require "activemerchant"
+
+get "/" do
+ "Hello World!"
+end \ No newline at end of file
diff --git a/benchmarks/workloads/ruby_template/BUILD b/benchmarks/workloads/ruby_template/BUILD
new file mode 100644
index 000000000..27f7c0c46
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/BUILD
@@ -0,0 +1,19 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ "Gemfile",
+ "Gemfile.lock",
+ "config.ru",
+ "index.erb",
+ "main.rb",
+ ],
+ strip_prefix = "third_party/gvisor/benchmarks/workloads/ruby_template",
+)
diff --git a/benchmarks/workloads/ruby_template/Dockerfile b/benchmarks/workloads/ruby_template/Dockerfile
new file mode 100755
index 000000000..a06d68bf4
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/Dockerfile
@@ -0,0 +1,38 @@
+# example based on https://github.com/errm/fib
+
+FROM alpine:3.9 as build
+
+COPY Gemfile Gemfile.lock ./
+
+RUN apk add --no-cache ruby ruby-dev ruby-bundler ruby-json build-base bash \
+ && bundle install --frozen -j4 -r3 --no-cache --without development \
+ && apk del --no-cache ruby-bundler \
+ && rm -rf /usr/lib/ruby/gems/*/cache
+
+FROM alpine:3.9 as prod
+
+COPY --from=build /usr/lib/ruby/gems /usr/lib/ruby/gems
+RUN apk add --no-cache ruby ruby-json ruby-etc redis apache2-utils \
+ && ruby -e "Gem::Specification.map.each do |spec| \
+ Gem::Installer.for_spec( \
+ spec, \
+ wrappers: true, \
+ force: true, \
+ install_dir: spec.base_dir, \
+ build_args: spec.build_args, \
+ ).generate_bin \
+ end"
+
+WORKDIR /app
+COPY . /app/.
+
+ENV PORT=9292 \
+ WEB_CONCURRENCY=20 \
+ WEB_MAX_THREADS=20 \
+ RACK_ENV=production
+
+ENV host localhost
+EXPOSE $PORT
+USER nobody
+STOPSIGNAL SIGINT
+CMD ["sh", "-c", "/usr/bin/puma", "${host}"]
diff --git a/benchmarks/workloads/ruby_template/Gemfile b/benchmarks/workloads/ruby_template/Gemfile
new file mode 100755
index 000000000..ac521b32c
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/Gemfile
@@ -0,0 +1,5 @@
+source "https://rubygems.org"
+
+gem "sinatra"
+gem "puma"
+gem "redis" \ No newline at end of file
diff --git a/benchmarks/workloads/ruby_template/Gemfile.lock b/benchmarks/workloads/ruby_template/Gemfile.lock
new file mode 100644
index 000000000..dd8d56fb7
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/Gemfile.lock
@@ -0,0 +1,26 @@
+GEM
+ remote: https://rubygems.org/
+ specs:
+ mustermann (1.0.3)
+ puma (3.12.0)
+ rack (2.0.6)
+ rack-protection (2.0.5)
+ rack
+ sinatra (2.0.5)
+ mustermann (~> 1.0)
+ rack (~> 2.0)
+ rack-protection (= 2.0.5)
+ tilt (~> 2.0)
+ tilt (2.0.9)
+ redis (4.1.0)
+
+PLATFORMS
+ ruby
+
+DEPENDENCIES
+ puma
+ sinatra
+ redis
+
+BUNDLED WITH
+ 1.17.1 \ No newline at end of file
diff --git a/benchmarks/workloads/ruby_template/config.ru b/benchmarks/workloads/ruby_template/config.ru
new file mode 100755
index 000000000..b2d135cc0
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/config.ru
@@ -0,0 +1,2 @@
+require './main'
+run Sinatra::Application \ No newline at end of file
diff --git a/benchmarks/workloads/ruby_template/index.erb b/benchmarks/workloads/ruby_template/index.erb
new file mode 100755
index 000000000..7f7300e80
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/index.erb
@@ -0,0 +1,8 @@
+<!DOCTYPE html>
+<html>
+<body>
+ <% text.each do |t| %>
+ <p><%= t %></p>
+ <% end %>
+</body>
+</html>
diff --git a/benchmarks/workloads/ruby_template/main.rb b/benchmarks/workloads/ruby_template/main.rb
new file mode 100755
index 000000000..35c239377
--- /dev/null
+++ b/benchmarks/workloads/ruby_template/main.rb
@@ -0,0 +1,27 @@
+require "sinatra"
+require "securerandom"
+require "redis"
+
+redis_host = ENV["host"]
+$redis = Redis.new(host: redis_host)
+
+def generateText
+ for i in 0..99
+ $redis.set(i, randomBody(1024))
+ end
+end
+
+def randomBody(length)
+ return SecureRandom.alphanumeric(length)
+end
+
+generateText
+template = ERB.new(File.read('./index.erb'))
+
+get "/" do
+ texts = Array.new
+ for i in 0..4
+ texts.push($redis.get(rand(0..99)))
+ end
+ template.result_with_hash(text: texts)
+end \ No newline at end of file
diff --git a/benchmarks/workloads/sleep/BUILD b/benchmarks/workloads/sleep/BUILD
new file mode 100644
index 000000000..eb0fb6165
--- /dev/null
+++ b/benchmarks/workloads/sleep/BUILD
@@ -0,0 +1,13 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/sleep/Dockerfile b/benchmarks/workloads/sleep/Dockerfile
new file mode 100644
index 000000000..24c72e07a
--- /dev/null
+++ b/benchmarks/workloads/sleep/Dockerfile
@@ -0,0 +1,3 @@
+FROM alpine:latest
+
+CMD ["sleep", "315360000"]
diff --git a/benchmarks/workloads/sysbench/BUILD b/benchmarks/workloads/sysbench/BUILD
new file mode 100644
index 000000000..fd2f8f03d
--- /dev/null
+++ b/benchmarks/workloads/sysbench/BUILD
@@ -0,0 +1,36 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "sysbench",
+ srcs = ["__init__.py"],
+)
+
+py_test(
+ name = "sysbench_test",
+ srcs = ["sysbench_test.py"],
+ python_version = "PY3",
+ deps = [
+ ":sysbench",
+ requirement("attrs", False),
+ requirement("atomicwrites", False),
+ requirement("more-itertools", False),
+ requirement("pathlib2", False),
+ requirement("pluggy", False),
+ requirement("py", False),
+ requirement("pytest", True),
+ requirement("six", False),
+ ],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/sysbench/Dockerfile b/benchmarks/workloads/sysbench/Dockerfile
new file mode 100644
index 000000000..8225e0e14
--- /dev/null
+++ b/benchmarks/workloads/sysbench/Dockerfile
@@ -0,0 +1,16 @@
+FROM ubuntu:18.04
+
+RUN set -x \
+ && apt-get update \
+ && apt-get install -y \
+ sysbench \
+ && rm -rf /var/lib/apt/lists/*
+
+# Parameterize the tests.
+ENV test cpu
+ENV threads 1
+ENV options ""
+
+# run sysbench once as a warm-up and take the second result
+CMD ["sh", "-c", "sysbench --threads=8 --memory-total-size=5G memory run > /dev/null && \
+sysbench --threads=${threads} ${options} ${test} run"]
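+
+# Illustrative usage (not part of this change): select the test and thread count
+# via the environment, e.g. docker run --rm -e test=memory -e threads=4 <image>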
diff --git a/benchmarks/workloads/sysbench/__init__.py b/benchmarks/workloads/sysbench/__init__.py
new file mode 100644
index 000000000..de357b4db
--- /dev/null
+++ b/benchmarks/workloads/sysbench/__init__.py
@@ -0,0 +1,167 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Sysbench."""
+
+import re
+
+STD_REGEX = r"events per second:\s*(\d*.?\d*)\n"
+MEM_REGEX = r"Total\soperations:\s+\d*\s*\((\d*\.\d*)\sper\ssecond\)"
+ALT_REGEX = r"execution time \(avg/stddev\):\s*(\d*.?\d*)/(\d*.?\d*)"
+AVG_REGEX = r"avg:[^\n^\d]*(\d*\.?\d*)"
+
+SAMPLE_CPU_DATA = """
+sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3)
+
+Running the test with following options:
+Number of threads: 8
+Initializing random number generator from current time
+
+
+Prime numbers limit: 10000
+
+Initializing worker threads...
+
+Threads started!
+
+CPU speed:
+ events per second: 9093.38
+
+General statistics:
+ total time: 10.0007s
+ total number of events: 90949
+
+Latency (ms):
+ min: 0.64
+ avg: 0.88
+ max: 24.65
+ 95th percentile: 1.55
+ sum: 79936.91
+
+Threads fairness:
+ events (avg/stddev): 11368.6250/831.38
+ execution time (avg/stddev): 9.9921/0.01
+"""
+
+SAMPLE_MEMORY_DATA = """
+sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3)
+
+Running the test with following options:
+Number of threads: 8
+Initializing random number generator from current time
+
+
+Running memory speed test with the following options:
+ block size: 1KiB
+ total size: 102400MiB
+ operation: write
+ scope: global
+
+Initializing worker threads...
+
+Threads started!
+
+Total operations: 47999046 (9597428.64 per second)
+
+46874.07 MiB transferred (9372.49 MiB/sec)
+
+
+General statistics:
+ total time: 5.0001s
+ total number of events: 47999046
+
+Latency (ms):
+ min: 0.00
+ avg: 0.00
+ max: 0.21
+ 95th percentile: 0.00
+ sum: 33165.91
+
+Threads fairness:
+ events (avg/stddev): 5999880.7500/111242.52
+ execution time (avg/stddev): 4.1457/0.09
+"""
+
+SAMPLE_MUTEX_DATA = """
+sysbench 1.0.11 (using system LuaJIT 2.1.0-beta3)
+
+Running the test with following options:
+Number of threads: 8
+Initializing random number generator from current time
+
+
+Initializing worker threads...
+
+Threads started!
+
+
+General statistics:
+ total time: 3.7869s
+ total number of events: 8
+
+Latency (ms):
+ min: 3688.56
+ avg: 3754.03
+ max: 3780.94
+ 95th percentile: 3773.42
+ sum: 30032.28
+
+Threads fairness:
+ events (avg/stddev): 1.0000/0.00
+ execution time (avg/stddev): 3.7540/0.03
+"""
+
+
+# pylint: disable=unused-argument
+def sample(test, **kwargs):
+ switch = {
+ "cpu": SAMPLE_CPU_DATA,
+ "memory": SAMPLE_MEMORY_DATA,
+ "mutex": SAMPLE_MUTEX_DATA,
+ "randwr": SAMPLE_CPU_DATA
+ }
+ return switch[test]
+
+
+# pylint: disable=unused-argument
+def cpu_events_per_second(data: str, **kwargs) -> float:
+ """Returns events per second."""
+ return float(re.compile(STD_REGEX).search(data).group(1))
+
+
+# pylint: disable=unused-argument
+def memory_ops_per_second(data: str, **kwargs) -> float:
+ """Returns memory operations per second."""
+ return float(re.compile(MEM_REGEX).search(data).group(1))
+
+
+# pylint: disable=unused-argument
+def mutex_time(data: str, count: int, locks: int, threads: int,
+ **kwargs) -> float:
+ """Returns normalized mutex time (lower is better)."""
+ value = float(re.compile(ALT_REGEX).search(data).group(1))
+ contention = float(threads) / float(locks)
+ scale = contention * float(count) / 100000000.0
+ return value / scale
+
+
+# pylint: disable=unused-argument
+def mutex_deviation(data: str, **kwargs) -> float:
+ """Returns deviation for threads."""
+ return float(re.compile(ALT_REGEX).search(data).group(2))
+
+
+# pylint: disable=unused-argument
+def mutex_latency(data: str, **kwargs) -> float:
+ """Returns average mutex latency."""
+ return float(re.compile(AVG_REGEX).search(data).group(1))
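+
+
+# For example, with the samples above, cpu_events_per_second(SAMPLE_CPU_DATA)
+# returns 9093.38 and memory_ops_per_second(SAMPLE_MEMORY_DATA) returns
+# 9597428.64.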
diff --git a/benchmarks/workloads/sysbench/sysbench_test.py b/benchmarks/workloads/sysbench/sysbench_test.py
new file mode 100644
index 000000000..3fb541fd2
--- /dev/null
+++ b/benchmarks/workloads/sysbench/sysbench_test.py
@@ -0,0 +1,34 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Parser test."""
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import sysbench
+
+
+def test_sysbench_parser():
+ """Test the basic parser."""
+ assert sysbench.cpu_events_per_second(sysbench.sample("cpu")) == 9093.38
+ assert sysbench.memory_ops_per_second(sysbench.sample("memory")) == 9597428.64
+ assert sysbench.mutex_time(sysbench.sample("mutex"), 1, 1,
+ 100000000.0) == 3.754
+ assert sysbench.mutex_deviation(sysbench.sample("mutex")) == 0.03
+ assert sysbench.mutex_latency(sysbench.sample("mutex")) == 3754.03
+
+
+if __name__ == "__main__":
+ sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/syscall/BUILD b/benchmarks/workloads/syscall/BUILD
new file mode 100644
index 000000000..5100cbb21
--- /dev/null
+++ b/benchmarks/workloads/syscall/BUILD
@@ -0,0 +1,37 @@
+load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement")
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "syscall",
+ srcs = ["__init__.py"],
+)
+
+py_test(
+ name = "syscall_test",
+ srcs = ["syscall_test.py"],
+ python_version = "PY3",
+ deps = [
+ ":syscall",
+ requirement("attrs", False),
+ requirement("atomicwrites", False),
+ requirement("more-itertools", False),
+ requirement("pathlib2", False),
+ requirement("pluggy", False),
+ requirement("py", False),
+ requirement("pytest", True),
+ requirement("six", False),
+ ],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ "syscall.c",
+ ],
+)
diff --git a/benchmarks/workloads/syscall/Dockerfile b/benchmarks/workloads/syscall/Dockerfile
new file mode 100644
index 000000000..a2088d953
--- /dev/null
+++ b/benchmarks/workloads/syscall/Dockerfile
@@ -0,0 +1,6 @@
+FROM gcc:latest
+COPY . /usr/src/syscall
+WORKDIR /usr/src/syscall
+RUN gcc -O2 -o syscall syscall.c
+ENV count 1000000
+CMD ["sh", "-c", "./syscall ${count}"]
diff --git a/benchmarks/workloads/syscall/__init__.py b/benchmarks/workloads/syscall/__init__.py
new file mode 100644
index 000000000..dc9028faa
--- /dev/null
+++ b/benchmarks/workloads/syscall/__init__.py
@@ -0,0 +1,29 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Simple syscall test."""
+
+import re
+
+SAMPLE_DATA = "Called getpid syscall 1000000 times: 1117 ms, 500 ns each."
+
+
+# pylint: disable=unused-argument
+def sample(**kwargs) -> str:
+ return SAMPLE_DATA
+
+
+# pylint: disable=unused-argument
+def syscall_time_ns(data: str, **kwargs) -> float:
+ """Returns the average time per system call in nanoseconds."""
+ return float(re.compile(r"(\d+)\sns each.").search(data).group(1))
diff --git a/benchmarks/workloads/syscall/syscall.c b/benchmarks/workloads/syscall/syscall.c
new file mode 100644
index 000000000..ded030397
--- /dev/null
+++ b/benchmarks/workloads/syscall/syscall.c
@@ -0,0 +1,55 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+// Short program that issues a syscall in a loop and outputs the elapsed time
+// measured with the MONOTONIC clock.
+int main(int argc, char** argv) {
+ struct timespec start, stop;
+ long result;
+ char buf[80];
+
+ if (argc < 2) {
+ printf("Usage:./syscall NUM_TIMES_TO_CALL");
+ return 1;
+ }
+
+ if (clock_gettime(CLOCK_MONOTONIC, &start)) return 1;
+
+ long loops = atoi(argv[1]);
+ for (long i = 0; i < loops; i++) {
+ syscall(SYS_gettimeofday, 0, 0);
+ }
+
+ if (clock_gettime(CLOCK_MONOTONIC, &stop)) return 1;
+
+ if ((stop.tv_nsec - start.tv_nsec) < 0) {
+ result = (stop.tv_sec - start.tv_sec - 1) * 1000;
+ result += (stop.tv_nsec - start.tv_nsec + 1000000000) / (1000 * 1000);
+ } else {
+ result = (stop.tv_sec - start.tv_sec) * 1000;
+ result += (stop.tv_nsec - start.tv_nsec) / (1000 * 1000);
+ }
+
+ printf("Called getpid syscall %d times: %lu ms, %lu ns each.\n", loops,
+ result, result * 1000000 / loops);
+
+ return 0;
+}
diff --git a/benchmarks/workloads/syscall/syscall_test.py b/benchmarks/workloads/syscall/syscall_test.py
new file mode 100644
index 000000000..72f027de1
--- /dev/null
+++ b/benchmarks/workloads/syscall/syscall_test.py
@@ -0,0 +1,27 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+import pytest
+
+from benchmarks.workloads import syscall
+
+
+def test_syscall_time_ns():
+ assert syscall.syscall_time_ns(syscall.sample()) == 500
+
+
+if __name__ == "__main__":
+ sys.exit(pytest.main([__file__]))
diff --git a/benchmarks/workloads/tensorflow/BUILD b/benchmarks/workloads/tensorflow/BUILD
new file mode 100644
index 000000000..026c3b316
--- /dev/null
+++ b/benchmarks/workloads/tensorflow/BUILD
@@ -0,0 +1,18 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+py_library(
+ name = "tensorflow",
+ srcs = ["__init__.py"],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+)
diff --git a/benchmarks/workloads/tensorflow/Dockerfile b/benchmarks/workloads/tensorflow/Dockerfile
new file mode 100644
index 000000000..262643b98
--- /dev/null
+++ b/benchmarks/workloads/tensorflow/Dockerfile
@@ -0,0 +1,14 @@
+FROM tensorflow/tensorflow:1.13.2
+
+RUN apt-get update \
+ && apt-get install -y git
+RUN git clone https://github.com/aymericdamien/TensorFlow-Examples.git
+RUN python -m pip install -U pip setuptools
+RUN python -m pip install matplotlib
+
+WORKDIR /TensorFlow-Examples/examples
+
+ENV PYTHONPATH="$PYTHONPATH:/TensorFlow-Examples/examples"
+
+ENV workload "3_NeuralNetworks/convolutional_network.py"
+CMD python ${workload}
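+
+# Illustrative usage (not part of this change): run a different example by
+# overriding the workload path (relative to /TensorFlow-Examples/examples), e.g.
+#   docker run --rm -e workload=<script.py> <image>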
diff --git a/benchmarks/workloads/tensorflow/__init__.py b/benchmarks/workloads/tensorflow/__init__.py
new file mode 100644
index 000000000..b5ec213f8
--- /dev/null
+++ b/benchmarks/workloads/tensorflow/__init__.py
@@ -0,0 +1,20 @@
+# python3
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A Tensorflow example."""
+
+
+# pylint: disable=unused-argument
+def run_time(value, **kwargs):
+ """Returns the startup and runtime of the Tensorflow workload in seconds."""
+ return value
diff --git a/benchmarks/workloads/true/BUILD b/benchmarks/workloads/true/BUILD
new file mode 100644
index 000000000..221c4b9a7
--- /dev/null
+++ b/benchmarks/workloads/true/BUILD
@@ -0,0 +1,14 @@
+load("@rules_pkg//:pkg.bzl", "pkg_tar")
+
+package(
+ default_visibility = ["//benchmarks:__subpackages__"],
+ licenses = ["notice"],
+)
+
+pkg_tar(
+ name = "tar",
+ srcs = [
+ "Dockerfile",
+ ],
+ extension = "tar",
+)
diff --git a/benchmarks/workloads/true/Dockerfile b/benchmarks/workloads/true/Dockerfile
new file mode 100644
index 000000000..2e97c921e
--- /dev/null
+++ b/benchmarks/workloads/true/Dockerfile
@@ -0,0 +1,3 @@
+FROM alpine:latest
+
+CMD ["true"]
diff --git a/go.mod b/go.mod
index 821273d22..c4687ed02 100644
--- a/go.mod
+++ b/go.mod
@@ -1,21 +1,23 @@
module gvisor.dev/gvisor
-go 1.12
+go 1.13
require (
- github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
- github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
- github.com/golang/mock v1.3.1
- github.com/golang/protobuf v1.3.1
- github.com/google/btree v1.0.0
- github.com/google/go-cmp v0.2.0
- github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
- github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
- github.com/kr/pty v1.1.1
- github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
- github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
- github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
- github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
- golang.org/x/net v0.0.0-20190311183353-d8887717615a
- golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
+ github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
+ github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
+ github.com/golang/mock v1.3.1
+ github.com/golang/protobuf v1.3.1
+ github.com/google/btree v1.0.0
+ github.com/google/go-cmp v0.2.0
+ github.com/google/go-github/v28 v28.1.1
+ github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
+ github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
+ github.com/kr/pty v1.1.1
+ github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
+ github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+ github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
+ github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
+ golang.org/x/net v0.0.0-20190311183353-d8887717615a
+ golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6
+ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
)
diff --git a/go.sum b/go.sum
index 7a0bc175a..434770beb 100644
--- a/go.sum
+++ b/go.sum
@@ -4,6 +4,7 @@ github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFU
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
+github.com/google/go-github/v28 v28.1.1/go.mod h1:bsqJWQX05omyWVmc00nEUql9mhQyv38lDZ8kPZcQVoM=
github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk=
github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
@@ -13,6 +14,7 @@ github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e/go.mod h1:+S
github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936/go.mod h1:ZjcWmFBXmLKZu9Nxj3WKYEafiSqer2rnvPr0en9UNpI=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
diff --git a/kokoro/iptables_tests.cfg b/kokoro/iptables_tests.cfg
new file mode 100644
index 000000000..7af20629a
--- /dev/null
+++ b/kokoro/iptables_tests.cfg
@@ -0,0 +1,10 @@
+build_file: "repo/scripts/iptables_test.sh"
+
+action {
+ define_artifacts {
+ regex: "**/sponge_log.xml"
+ regex: "**/sponge_log.log"
+ regex: "**/outputs.zip"
+ regex: "**/runsc_logs_*.tar.gz"
+ }
+}
diff --git a/kokoro/issue_reviver.cfg b/kokoro/issue_reviver.cfg
new file mode 100644
index 000000000..2370d9250
--- /dev/null
+++ b/kokoro/issue_reviver.cfg
@@ -0,0 +1,15 @@
+build_file: "repo/scripts/issue_reviver.sh"
+
+before_action {
+ fetch_keystore {
+ keystore_resource {
+ keystore_config_id: 73898
+ keyname: "kokoro-github-access-token"
+ }
+ }
+}
+
+env_vars {
+ key: "KOKORO_GITHUB_ACCESS_TOKEN"
+ value: "73898_kokoro-github-access-token"
+}
diff --git a/kokoro/kythe/generate_xrefs.cfg b/kokoro/kythe/generate_xrefs.cfg
new file mode 100644
index 000000000..03e65c54e
--- /dev/null
+++ b/kokoro/kythe/generate_xrefs.cfg
@@ -0,0 +1,28 @@
+build_file: "gvisor/kokoro/kythe/generate_xrefs.sh"
+
+before_action {
+ fetch_keystore {
+ keystore_resource {
+ keystore_config_id: 73898
+ keyname: "kokoro-rbe-service-account"
+ }
+ }
+}
+
+bazel_setting {
+ project_id: "gvisor-rbe"
+ local_execution: false
+ auth_credential: {
+ keystore_config_id: 73898
+ keyname: "kokoro-rbe-service-account"
+ }
+ bes_backend_address: "buildeventservice.googleapis.com"
+ foundry_backend_address: "remotebuildexecution.googleapis.com"
+ upsalite_frontend_address: "https://source.cloud.google.com"
+}
+
+action {
+ define_artifacts {
+ regex: "*.kzip"
+ }
+}
diff --git a/kokoro/kythe/generate_xrefs.sh b/kokoro/kythe/generate_xrefs.sh
new file mode 100644
index 000000000..799467a34
--- /dev/null
+++ b/kokoro/kythe/generate_xrefs.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -ex
+
+# Install the latest version of Bazel. The default on Kokoro images is out of
+# date.
+if command -v use_bazel.sh >/dev/null; then
+ use_bazel.sh latest
+fi
+bazel version
+
+python3 -V
+
+readonly KYTHE_VERSION='v0.0.37'
+readonly WORKDIR="$(mktemp -d)"
+readonly KYTHE_DIR="${WORKDIR}/kythe-${KYTHE_VERSION}"
+if [[ -n "$KOKORO_GIT_COMMIT" ]]; then
+ readonly KZIP_FILENAME="${KOKORO_ARTIFACTS_DIR}/${KOKORO_GIT_COMMIT}.kzip"
+else
+ readonly KZIP_FILENAME="$(git rev-parse HEAD).kzip"
+fi
+
+wget -q -O "${WORKDIR}/kythe.tar.gz" \
+ "https://github.com/kythe/kythe/releases/download/${KYTHE_VERSION}/kythe-${KYTHE_VERSION}.tar.gz"
+tar --no-same-owner -xzf "${WORKDIR}/kythe.tar.gz" --directory "$WORKDIR"
+
+if [[ -n "$KOKORO_ARTIFACTS_DIR" ]]; then
+ cd "${KOKORO_ARTIFACTS_DIR}/github/gvisor"
+fi
+bazel \
+ --bazelrc="${KYTHE_DIR}/extractors.bazelrc" \
+ build \
+ --override_repository kythe_release="${KYTHE_DIR}" \
+ --define=kythe_corpus=gvisor.dev \
+ --cxxopt=-std=c++17 \
+ //...
+
+"${KYTHE_DIR}/tools/kzip" merge \
+ --output "$KZIP_FILENAME" \
+ $(find -L bazel-out/*/extra_actions/ -name '*.kzip')
diff --git a/kokoro/ubuntu1604/10_core.sh b/kokoro/ubuntu1604/10_core.sh
index e87a6eee8..46dda6bb1 100755
--- a/kokoro/ubuntu1604/10_core.sh
+++ b/kokoro/ubuntu1604/10_core.sh
@@ -21,8 +21,8 @@ apt-get update && apt-get -y install make git-core build-essential linux-headers
# Install a recent go toolchain.
if ! [[ -d /usr/local/go ]]; then
- wget https://dl.google.com/go/go1.12.linux-amd64.tar.gz
- tar -xvf go1.12.linux-amd64.tar.gz
+ wget https://dl.google.com/go/go1.13.5.linux-amd64.tar.gz
+ tar -xvf go1.13.5.linux-amd64.tar.gz
mv go /usr/local
fi
diff --git a/kokoro/ubuntu1604/40_kokoro.sh b/kokoro/ubuntu1604/40_kokoro.sh
index 3f50929d5..5f2dfc858 100755
--- a/kokoro/ubuntu1604/40_kokoro.sh
+++ b/kokoro/ubuntu1604/40_kokoro.sh
@@ -23,7 +23,7 @@ declare -r ssh_public_keys=(
)
# Install dependencies.
-apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip zip
+apt-get update && apt-get install -y rsync coreutils python-psutil qemu-kvm python-pip python3-pip zip
# junitparser is used to merge junit xml files.
pip install junitparser
diff --git a/kokoro/ubuntu1604/README.md b/kokoro/ubuntu1604/README.md
new file mode 100644
index 000000000..64f913b9a
--- /dev/null
+++ b/kokoro/ubuntu1604/README.md
@@ -0,0 +1,34 @@
+## Image Update
+
+After making changes to files in the directory, you must run the following
+commands to update the image Kokoro uses:
+
+```shell
+gcloud config set project gvisor-kokoro-testing
+third_party/gvisor/kokoro/ubuntu1604/build.sh
+third_party/gvisor/kokoro/ubuntu1804/build.sh
+```
+
+Note: the command above will change your default project for `gcloud`. Run
+`gcloud config set project` again to revert to your default project.
+
+Note: Files in `third_party/gvisor/kokoro/ubuntu1804/` are symlinks to
+`ubuntu1604`, so both images must be updated.
+
+After the script finishes, the last few lines of the output will contain the
+image name. If the output was lost, you can run `build.sh` again to print the
+image name.
+
+```
+NAME PROJECT FAMILY DEPRECATED STATUS
+image-6777fa4666a968c8 gvisor-kokoro-testing READY
++ cleanup
++ gcloud compute instances delete --quiet build-tlfrdv
+Deleted [https://www.googleapis.com/compute/v1/projects/gvisor-kokoro-testing/zones/us-central1-f/instances/build-tlfrdv].
+```
+
+To set up Kokoro to use the new image, copy the image names to their
+corresponding file below:
+
+* //devtools/kokoro/config/gcp/gvisor/ubuntu1604.gcl
+* //devtools/kokoro/config/gcp/gvisor/ubuntu1804.gcl
diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD
index 51774c6b6..716ff22d2 100644
--- a/pkg/abi/linux/BUILD
+++ b/pkg/abi/linux/BUILD
@@ -41,6 +41,7 @@ go_library(
"poll.go",
"prctl.go",
"ptrace.go",
+ "rseq.go",
"rusage.go",
"sched.go",
"seccomp.go",
@@ -57,6 +58,7 @@ go_library(
"uio.go",
"utsname.go",
"wait.go",
+ "xattr.go",
],
importpath = "gvisor.dev/gvisor/pkg/abi/linux",
visibility = ["//visibility:public"],
diff --git a/pkg/abi/linux/fcntl.go b/pkg/abi/linux/fcntl.go
index f78315ebf..6663a199c 100644
--- a/pkg/abi/linux/fcntl.go
+++ b/pkg/abi/linux/fcntl.go
@@ -16,15 +16,17 @@ package linux
// Commands from linux/fcntl.h.
const (
- F_DUPFD = 0x0
- F_GETFD = 0x1
- F_SETFD = 0x2
- F_GETFL = 0x3
- F_SETFL = 0x4
- F_SETLK = 0x6
- F_SETLKW = 0x7
- F_SETOWN = 0x8
- F_GETOWN = 0x9
+ F_DUPFD = 0
+ F_GETFD = 1
+ F_SETFD = 2
+ F_GETFL = 3
+ F_SETFL = 4
+ F_SETLK = 6
+ F_SETLKW = 7
+ F_SETOWN = 8
+ F_GETOWN = 9
+ F_SETOWN_EX = 15
+ F_GETOWN_EX = 16
F_DUPFD_CLOEXEC = 1024 + 6
F_SETPIPE_SZ = 1024 + 7
F_GETPIPE_SZ = 1024 + 8
@@ -32,9 +34,9 @@ const (
// Commands for F_SETLK.
const (
- F_RDLCK = 0x0
- F_WRLCK = 0x1
- F_UNLCK = 0x2
+ F_RDLCK = 0
+ F_WRLCK = 1
+ F_UNLCK = 2
)
// Flags for fcntl.
@@ -42,7 +44,7 @@ const (
FD_CLOEXEC = 00000001
)
-// Lock structure for F_SETLK.
+// Flock is the lock structure for F_SETLK.
type Flock struct {
Type int16
Whence int16
@@ -52,3 +54,16 @@ type Flock struct {
Pid int32
_ [4]byte
}
+
+// Flags for F_SETOWN_EX and F_GETOWN_EX.
+const (
+ F_OWNER_TID = 0
+ F_OWNER_PID = 1
+ F_OWNER_PGRP = 2
+)
+
+// FOwnerEx is the owner structure for F_SETOWN_EX and F_GETOWN_EX.
+type FOwnerEx struct {
+ Type int32
+ PID int32
+}
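
The new F_SETOWN_EX/F_GETOWN_EX constants and the FOwnerEx struct above carry fcntl(2)'s extended-owner argument. A minimal Go sketch of how they fit together; the PID value and the use of encoding/binary for the byte layout are illustrative, not how the sentry itself copies the struct in or out:

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

func main() {
	// Direct I/O-availability signals at a whole process group, as a
	// caller of fcntl(fd, F_SETOWN_EX, &owner) would.
	owner := linux.FOwnerEx{
		Type: linux.F_OWNER_PGRP,
		PID:  1234, // hypothetical process group ID
	}

	// Lay the struct out as little-endian bytes, mirroring its in-memory
	// representation on amd64.
	var buf bytes.Buffer
	if err := binary.Write(&buf, binary.LittleEndian, &owner); err != nil {
		panic(err)
	}
	fmt.Printf("f_owner_ex: %x\n", buf.Bytes())
}
```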
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index c9ee098f4..16791d03e 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -144,9 +144,13 @@ const (
ModeCharacterDevice = S_IFCHR
ModeNamedPipe = S_IFIFO
- ModeSetUID = 04000
- ModeSetGID = 02000
- ModeSticky = 01000
+ S_ISUID = 04000
+ S_ISGID = 02000
+ S_ISVTX = 01000
+
+ ModeSetUID = S_ISUID
+ ModeSetGID = S_ISGID
+ ModeSticky = S_ISVTX
ModeUserAll = 0700
ModeUserRead = 0400
@@ -282,6 +286,29 @@ func (m FileMode) String() string {
return strings.Join(s, "|")
}
+// DirentType maps file types to dirent types appropriate for (struct
+// dirent)::d_type.
+func (m FileMode) DirentType() uint8 {
+ switch m.FileType() {
+ case ModeSocket:
+ return DT_SOCK
+ case ModeSymlink:
+ return DT_LNK
+ case ModeRegular:
+ return DT_REG
+ case ModeBlockDevice:
+ return DT_BLK
+ case ModeDirectory:
+ return DT_DIR
+ case ModeCharacterDevice:
+ return DT_CHR
+ case ModeNamedPipe:
+ return DT_FIFO
+ default:
+ return DT_UNKNOWN
+ }
+}
+
var modeExtraBits = abi.FlagSet{
{
Flag: ModeSetUID,
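
DirentType is what a getdents-style emulation needs when filling struct dirent's d_type from an inode's mode. A short usage sketch; the modes below are arbitrary examples:

```go
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

func main() {
	for _, m := range []linux.FileMode{
		linux.ModeRegular | 0644,
		linux.ModeDirectory | 0755,
		linux.ModeSymlink | 0777,
	} {
		fmt.Printf("%v -> d_type %d\n", m, m.DirentType())
	}
}
```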
diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go
index b416e3472..2c652baa2 100644
--- a/pkg/abi/linux/fs.go
+++ b/pkg/abi/linux/fs.go
@@ -92,3 +92,10 @@ const (
SYNC_FILE_RANGE_WRITE = 2
SYNC_FILE_RANGE_WAIT_AFTER = 4
)
+
+// Flag argument to renameat2(2), from include/uapi/linux/fs.h.
+const (
+ RENAME_NOREPLACE = (1 << 0) // Don't overwrite target.
+ RENAME_EXCHANGE = (1 << 1) // Exchange src and dst.
+ RENAME_WHITEOUT = (1 << 2) // Whiteout src.
+)
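
In Linux these flags are mutually constrained: RENAME_EXCHANGE cannot be combined with RENAME_NOREPLACE or RENAME_WHITEOUT, and unknown bits are rejected. A hedged sketch of that validation; the function name and returned errno are illustrative, not the sentry's actual handler:

```go
package main

import (
	"fmt"
	"syscall"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

// checkRenameFlags rejects unknown bits and the flag combinations that Linux
// documents as invalid for renameat2(2).
func checkRenameFlags(flags uint32) error {
	const known = linux.RENAME_NOREPLACE | linux.RENAME_EXCHANGE | linux.RENAME_WHITEOUT
	if flags&^uint32(known) != 0 {
		return syscall.EINVAL
	}
	if flags&linux.RENAME_EXCHANGE != 0 &&
		flags&(linux.RENAME_NOREPLACE|linux.RENAME_WHITEOUT) != 0 {
		return syscall.EINVAL
	}
	return nil
}

func main() {
	fmt.Println(checkRenameFlags(linux.RENAME_NOREPLACE))                        // <nil>
	fmt.Println(checkRenameFlags(linux.RENAME_EXCHANGE | linux.RENAME_WHITEOUT)) // invalid argument
}
```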
diff --git a/pkg/abi/linux/netfilter.go b/pkg/abi/linux/netfilter.go
index 269ba5567..33fcc6c95 100644
--- a/pkg/abi/linux/netfilter.go
+++ b/pkg/abi/linux/netfilter.go
@@ -42,6 +42,15 @@ const (
NF_RETURN = -NF_REPEAT - 1
)
+// VerdictStrings maps int verdicts to the strings they represent. It is used
+// for debugging.
+var VerdictStrings = map[int32]string{
+ -NF_DROP - 1: "DROP",
+ -NF_ACCEPT - 1: "ACCEPT",
+ -NF_QUEUE - 1: "QUEUE",
+ NF_RETURN: "RETURN",
+}
+
// Socket options. These correspond to values in
// include/uapi/linux/netfilter_ipv4/ip_tables.h.
const (
@@ -179,7 +188,7 @@ const SizeOfXTCounters = 16
// the user data.
type XTEntryMatch struct {
MatchSize uint16
- Name [XT_EXTENSION_MAXNAMELEN]byte
+ Name ExtensionName
Revision uint8
// Data is omitted here because it would cause XTEntryMatch to be an
// extra byte larger (see http://www.catb.org/esr/structure-packing/).
@@ -199,7 +208,7 @@ const SizeOfXTEntryMatch = 32
// the user data.
type XTEntryTarget struct {
TargetSize uint16
- Name [XT_EXTENSION_MAXNAMELEN]byte
+ Name ExtensionName
Revision uint8
// Data is omitted here because it would cause XTEntryTarget to be an
// extra byte larger (see http://www.catb.org/esr/structure-packing/).
@@ -226,9 +235,9 @@ const SizeOfXTStandardTarget = 40
// ErrorName. It corresponds to struct xt_error_target in
// include/uapi/linux/netfilter/x_tables.h.
type XTErrorTarget struct {
- Target XTEntryTarget
- ErrorName [XT_FUNCTION_MAXNAMELEN]byte
- _ [2]byte
+ Target XTEntryTarget
+ Name ErrorName
+ _ [2]byte
}
// SizeOfXTErrorTarget is the size of an XTErrorTarget.
@@ -237,7 +246,7 @@ const SizeOfXTErrorTarget = 64
// IPTGetinfo is the argument for the IPT_SO_GET_INFO sockopt. It corresponds
// to struct ipt_getinfo in include/uapi/linux/netfilter_ipv4/ip_tables.h.
type IPTGetinfo struct {
- Name [XT_TABLE_MAXNAMELEN]byte
+ Name TableName
ValidHooks uint32
HookEntry [NF_INET_NUMHOOKS]uint32
Underflow [NF_INET_NUMHOOKS]uint32
@@ -248,16 +257,11 @@ type IPTGetinfo struct {
// SizeOfIPTGetinfo is the size of an IPTGetinfo.
const SizeOfIPTGetinfo = 84
-// TableName returns the table name.
-func (info *IPTGetinfo) TableName() string {
- return tableName(info.Name[:])
-}
-
// IPTGetEntries is the argument for the IPT_SO_GET_ENTRIES sockopt. It
// corresponds to struct ipt_get_entries in
// include/uapi/linux/netfilter_ipv4/ip_tables.h.
type IPTGetEntries struct {
- Name [XT_TABLE_MAXNAMELEN]byte
+ Name TableName
Size uint32
_ [4]byte
// Entrytable is omitted here because it would cause IPTGetEntries to
@@ -266,34 +270,22 @@ type IPTGetEntries struct {
// Entrytable [0]IPTEntry
}
-// TableName returns the entries' table name.
-func (entries *IPTGetEntries) TableName() string {
- return tableName(entries.Name[:])
-}
-
// SizeOfIPTGetEntries is the size of an IPTGetEntries.
const SizeOfIPTGetEntries = 40
-// KernelIPTGetEntries is identical to IPTEntry, but includes the Elems field.
-// This struct marshaled via the binary package to write an KernelIPTGetEntries
-// to userspace.
+// KernelIPTGetEntries is identical to IPTGetEntries, but includes the
+// Entrytable field. This struct is marshaled via the binary package to write
+// a KernelIPTGetEntries to userspace.
type KernelIPTGetEntries struct {
- Name [XT_TABLE_MAXNAMELEN]byte
- Size uint32
- _ [4]byte
+ IPTGetEntries
Entrytable []KernelIPTEntry
}
-// TableName returns the entries' table name.
-func (entries *KernelIPTGetEntries) TableName() string {
- return tableName(entries.Name[:])
-}
-
// IPTReplace is the argument for the IPT_SO_SET_REPLACE sockopt. It
// corresponds to struct ipt_replace in
// include/uapi/linux/netfilter_ipv4/ip_tables.h.
type IPTReplace struct {
- Name [XT_TABLE_MAXNAMELEN]byte
+ Name TableName
ValidHooks uint32
NumEntries uint32
Size uint32
@@ -306,14 +298,45 @@ type IPTReplace struct {
// Entries [0]IPTEntry
}
+// KernelIPTReplace is identical to IPTReplace, but includes the Entries field.
+type KernelIPTReplace struct {
+ IPTReplace
+ Entries [0]IPTEntry
+}
+
// SizeOfIPTReplace is the size of an IPTReplace.
const SizeOfIPTReplace = 96
-func tableName(name []byte) string {
- for i, c := range name {
+// ExtensionName holds the name of a netfilter extension.
+type ExtensionName [XT_EXTENSION_MAXNAMELEN]byte
+
+// String implements fmt.Stringer.
+func (en ExtensionName) String() string {
+ return goString(en[:])
+}
+
+// TableName holds the name of a netfilter table.
+type TableName [XT_TABLE_MAXNAMELEN]byte
+
+// String implements fmt.Stringer.
+func (tn TableName) String() string {
+ return goString(tn[:])
+}
+
+// ErrorName holds the name of a netfilter error. These can also hold
+// user-defined chains.
+type ErrorName [XT_FUNCTION_MAXNAMELEN]byte
+
+// String implements fmt.Stringer.
+func (en ErrorName) String() string {
+ return goString(en[:])
+}
+
+func goString(cstring []byte) string {
+ for i, c := range cstring {
if c == 0 {
- return string(name[:i])
+ return string(cstring[:i])
}
}
- return string(name)
+ return string(cstring)
}
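
The fixed-size name arrays now carry their own String methods via goString, which trims at the first NUL the way iptables writes these sockopt buffers. A small sketch of the resulting API:

```go
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

func main() {
	// Fill a TableName the way a sockopt buffer arrives: NUL-padded.
	var name linux.TableName
	copy(name[:], "filter")
	fmt.Println(name.String()) // "filter"

	// VerdictStrings gives readable forms of the encoded verdicts.
	fmt.Println(linux.VerdictStrings[linux.NF_RETURN]) // "RETURN"
}
```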
diff --git a/pkg/abi/linux/rseq.go b/pkg/abi/linux/rseq.go
new file mode 100644
index 000000000..76253ba30
--- /dev/null
+++ b/pkg/abi/linux/rseq.go
@@ -0,0 +1,130 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+// Flags passed to rseq(2).
+//
+// Defined in include/uapi/linux/rseq.h.
+const (
+ // RSEQ_FLAG_UNREGISTER unregisters the current thread.
+ RSEQ_FLAG_UNREGISTER = 1 << 0
+)
+
+// Critical section flags used in RSeqCriticalSection.Flags and RSeq.Flags.
+//
+// Defined in include/uapi/linux/rseq.h.
+const (
+ // RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT inhibits restart on preemption.
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = 1 << 0
+
+ // RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL inhibits restart on signal
+ // delivery.
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = 1 << 1
+
+ // RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE inhibits restart on CPU
+ // migration.
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = 1 << 2
+)
+
+// RSeqCriticalSection describes a restartable sequences critical section. It
+// is equivalent to struct rseq_cs, defined in include/uapi/linux/rseq.h.
+//
+// In userspace, this structure is always aligned to 32 bytes.
+//
+// +marshal
+type RSeqCriticalSection struct {
+ // Version is the version of this structure. Version 0 is defined here.
+ Version uint32
+
+ // Flags are the critical section flags, defined above.
+ Flags uint32
+
+ // Start is the start address of the critical section.
+ Start uint64
+
+ // PostCommitOffset is the offset from Start of the first instruction
+ // outside of the critical section.
+ PostCommitOffset uint64
+
+ // Abort is the abort address. It must be outside the critical section,
+ // and the 4 bytes prior must match the abort signature.
+ Abort uint64
+}
+
+const (
+ // SizeOfRSeqCriticalSection is the size of RSeqCriticalSection.
+ SizeOfRSeqCriticalSection = 32
+
+ // SizeOfRSeqSignature is the size of the signature immediately
+ // preceding RSeqCriticalSection.Abort.
+ SizeOfRSeqSignature = 4
+)
+
+// Special values for RSeq.CPUID, defined in include/uapi/linux/rseq.h.
+const (
+ // RSEQ_CPU_ID_UNINITIALIZED indicates that this thread has not
+ // performed rseq initialization.
+ RSEQ_CPU_ID_UNINITIALIZED = ^uint32(0) // -1
+
+ // RSEQ_CPU_ID_REGISTRATION_FAILED indicates that rseq initialization
+ // failed.
+ RSEQ_CPU_ID_REGISTRATION_FAILED = ^uint32(1) // -2
+)
+
+// RSeq is the thread-local restartable sequences config/status. It
+// is equivalent to struct rseq, defined in include/uapi/linux/rseq.h.
+//
+// In userspace, this structure is always aligned to 32 bytes.
+type RSeq struct {
+ // CPUIDStart contains the current CPU ID if rseq is initialized.
+ //
+ // This field should only be read by the thread which registered this
+ // structure, and must be read atomically.
+ CPUIDStart uint32
+
+ // CPUID contains the current CPU ID or one of the CPU ID special
+ // values defined above.
+ //
+ // This field should only be read by the thread which registered this
+ // structure, and must be read atomically.
+ CPUID uint32
+
+ // RSeqCriticalSection is a pointer to the current RSeqCriticalSection
+ // block, or NULL. It is reset to NULL by the kernel on restart or
+ // non-restarting preempt/signal.
+ //
+ // This field should only be written by the thread which registered
+ // this structure, and must be written atomically.
+ RSeqCriticalSection uint64
+
+ // Flags are the critical section flags that apply to all critical
+ // sections on this thread, defined above.
+ Flags uint32
+}
+
+const (
+ // SizeOfRSeq is the size of RSeq.
+ //
+ // Note that RSeq is naively 24 bytes. However, it has 32-byte
+ // alignment, which in C increases sizeof to 32. That is the size that
+ // the Linux kernel uses.
+ SizeOfRSeq = 32
+
+ // AlignOfRSeq is the standard alignment of RSeq.
+ AlignOfRSeq = 32
+
+ // OffsetOfRSeqCriticalSection is the offset of RSeqCriticalSection in RSeq.
+ OffsetOfRSeqCriticalSection = 8
+)
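
Because RSeq and RSeqCriticalSection mirror kernel ABI structs, the hand-maintained size and offset constants above can be cross-checked against the Go layout; note that SizeOfRSeq (32) is deliberately larger than Go's sizeof, since the kernel rounds the struct up to its 32-byte alignment. A quick sanity-check sketch:

```go
package main

import (
	"fmt"
	"unsafe"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

func main() {
	var r linux.RSeq
	var cs linux.RSeqCriticalSection

	fmt.Println(unsafe.Offsetof(r.RSeqCriticalSection) == linux.OffsetOfRSeqCriticalSection) // true
	fmt.Println(unsafe.Sizeof(cs) == linux.SizeOfRSeqCriticalSection)                        // true

	// Go reports the unpadded size; the kernel-facing size adds padding
	// up to AlignOfRSeq.
	fmt.Println(unsafe.Sizeof(r), linux.SizeOfRSeq) // 24 32
}
```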
diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go
index 2e2cc6be7..766ee4014 100644
--- a/pkg/abi/linux/socket.go
+++ b/pkg/abi/linux/socket.go
@@ -422,6 +422,15 @@ type ControlMessageRights []int32
// ControlMessageRights.
const SizeOfControlMessageRight = 4
+// SizeOfControlMessageInq is the size of a TCP_INQ control message.
+const SizeOfControlMessageInq = 4
+
+// SizeOfControlMessageTOS is the size of an IP_TOS control message.
+const SizeOfControlMessageTOS = 1
+
+// SizeOfControlMessageTClass is the size of an IPV6_TCLASS control message.
+const SizeOfControlMessageTClass = 4
+
// SCM_MAX_FD is the maximum number of FDs accepted in a single sendmsg call.
// From net/scm.h.
const SCM_MAX_FD = 253
diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go
index 546668bca..5c5a58cd4 100644
--- a/pkg/abi/linux/time.go
+++ b/pkg/abi/linux/time.go
@@ -234,6 +234,19 @@ type StatxTimestamp struct {
_ int32
}
+// ToNsec returns the nanosecond representation.
+func (sxts StatxTimestamp) ToNsec() int64 {
+ return int64(sxts.Sec)*1e9 + int64(sxts.Nsec)
+}
+
+// ToNsecCapped returns the safe nanosecond representation.
+func (sxts StatxTimestamp) ToNsecCapped() int64 {
+ if sxts.Sec > maxSecInDuration {
+ return math.MaxInt64
+ }
+ return sxts.ToNsec()
+}
+
// NsecToStatxTimestamp translates nanoseconds to StatxTimestamp.
func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) {
return StatxTimestamp{
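
ToNsec and ToNsecCapped round-trip with the existing NsecToStatxTimestamp helper; the capped variant only saturates for seconds values past the int64-nanosecond horizon. A brief illustration:

```go
package main

import (
	"fmt"
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

func main() {
	now := time.Date(2020, time.January, 1, 0, 0, 0, 500, time.UTC).UnixNano()
	ts := linux.NsecToStatxTimestamp(now)
	fmt.Println(ts.ToNsec() == now)       // true
	fmt.Println(ts.ToNsecCapped() == now) // true; capping only matters near MaxInt64
}
```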
diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/abi/linux/xattr.go
index c36c4aff5..a3b6406fa 100644
--- a/pkg/sentry/fsimpl/proc/filesystems.go
+++ b/pkg/abi/linux/xattr.go
@@ -12,14 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-package proc
+package linux
-// filesystemsData implements vfs.DynamicBytesSource for /proc/filesystems.
-//
-// +stateify savable
-type filesystemsData struct{}
+// Constants for extended attributes.
+const (
+ XATTR_NAME_MAX = 255
+ XATTR_SIZE_MAX = 65536
+
+ XATTR_CREATE = 1
+ XATTR_REPLACE = 2
-// TODO(b/138862512): Implement vfs.DynamicBytesSource.Generate for
-// filesystemsData. We would need to retrive filesystem names from
-// vfs.VirtualFilesystem. Also needs vfs replacement for
-// fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev.
+ XATTR_USER_PREFIX = "user."
+ XATTR_USER_PREFIX_LEN = len(XATTR_USER_PREFIX)
+)
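
The xattr constants encode the limits a setxattr(2)/getxattr(2) implementation enforces. A hedged sketch of the checks a handler for the "user." namespace might perform; the function name and error text are illustrative, not the sentry's actual errnos:

```go
package main

import (
	"fmt"
	"strings"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

// checkUserXattr validates a user-namespace attribute against the new limits:
// the name stays within XATTR_NAME_MAX, carries the "user." prefix, and the
// value fits in XATTR_SIZE_MAX.
func checkUserXattr(name string, value []byte) error {
	switch {
	case len(name) > linux.XATTR_NAME_MAX:
		return fmt.Errorf("name longer than %d bytes", linux.XATTR_NAME_MAX)
	case !strings.HasPrefix(name, linux.XATTR_USER_PREFIX):
		return fmt.Errorf("only %q attributes handled here", linux.XATTR_USER_PREFIX)
	case len(value) > linux.XATTR_SIZE_MAX:
		return fmt.Errorf("value larger than %d bytes", linux.XATTR_SIZE_MAX)
	}
	return nil
}

func main() {
	fmt.Println(checkUserXattr("user.comment", []byte("hello"))) // <nil>
}
```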
diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD
index 6bc486b62..d99e37b40 100644
--- a/pkg/amutex/BUILD
+++ b/pkg/amutex/BUILD
@@ -15,4 +15,5 @@ go_test(
size = "small",
srcs = ["amutex_test.go"],
embed = [":amutex"],
+ deps = ["//pkg/sync"],
)
diff --git a/pkg/amutex/amutex_test.go b/pkg/amutex/amutex_test.go
index 1d7f45641..8a3952f2a 100644
--- a/pkg/amutex/amutex_test.go
+++ b/pkg/amutex/amutex_test.go
@@ -15,9 +15,10 @@
package amutex
import (
- "sync"
"testing"
"time"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
type sleeper struct {
diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD
index 36beaade9..6403c60c2 100644
--- a/pkg/atomicbitops/BUILD
+++ b/pkg/atomicbitops/BUILD
@@ -20,4 +20,5 @@ go_test(
size = "small",
srcs = ["atomic_bitops_test.go"],
embed = [":atomicbitops"],
+ deps = ["//pkg/sync"],
)
diff --git a/pkg/atomicbitops/atomic_bitops_test.go b/pkg/atomicbitops/atomic_bitops_test.go
index 965e9be79..9466d3e23 100644
--- a/pkg/atomicbitops/atomic_bitops_test.go
+++ b/pkg/atomicbitops/atomic_bitops_test.go
@@ -16,8 +16,9 @@ package atomicbitops
import (
"runtime"
- "sync"
"testing"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
const iterations = 100
diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD
index a0b21d4bd..2bb581b18 100644
--- a/pkg/compressio/BUILD
+++ b/pkg/compressio/BUILD
@@ -8,7 +8,10 @@ go_library(
srcs = ["compressio.go"],
importpath = "gvisor.dev/gvisor/pkg/compressio",
visibility = ["//:sandbox"],
- deps = ["//pkg/binary"],
+ deps = [
+ "//pkg/binary",
+ "//pkg/sync",
+ ],
)
go_test(
diff --git a/pkg/compressio/compressio.go b/pkg/compressio/compressio.go
index 3b0bb086e..5f52cbe74 100644
--- a/pkg/compressio/compressio.go
+++ b/pkg/compressio/compressio.go
@@ -52,9 +52,9 @@ import (
"hash"
"io"
"runtime"
- "sync"
"gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sync"
)
var bufPool = sync.Pool{
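
The mechanical change across these packages is swapping the standard library "sync" import for "gvisor.dev/gvisor/pkg/sync"; as the bufPool line above shows, call sites keep the same identifiers. A sketch of what a caller looks like after the swap, assuming only the standard Pool surface (the package and helper names are illustrative):

```go
package example

import (
	"gvisor.dev/gvisor/pkg/sync"
)

// bufPool is used exactly as a standard library sync.Pool; only the import
// path differs.
var bufPool = sync.Pool{
	New: func() interface{} {
		return make([]byte, 0, 4096)
	},
}

// withBuffer borrows a scratch buffer from the pool for the duration of f.
func withBuffer(f func(b []byte)) {
	b := bufPool.Get().([]byte)
	defer bufPool.Put(b[:0])
	f(b)
}
```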
diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD
index 21adf3adf..adbd1e3f8 100644
--- a/pkg/control/server/BUILD
+++ b/pkg/control/server/BUILD
@@ -9,6 +9,7 @@ go_library(
visibility = ["//:sandbox"],
deps = [
"//pkg/log",
+ "//pkg/sync",
"//pkg/unet",
"//pkg/urpc",
],
diff --git a/pkg/control/server/server.go b/pkg/control/server/server.go
index a56152d10..41abe1f2d 100644
--- a/pkg/control/server/server.go
+++ b/pkg/control/server/server.go
@@ -22,9 +22,9 @@ package server
import (
"os"
- "sync"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
"gvisor.dev/gvisor/pkg/urpc"
)
diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD
index 0b4b7cc44..9d68682c7 100644
--- a/pkg/eventchannel/BUILD
+++ b/pkg/eventchannel/BUILD
@@ -15,6 +15,7 @@ go_library(
deps = [
":eventchannel_go_proto",
"//pkg/log",
+ "//pkg/sync",
"//pkg/unet",
"@com_github_golang_protobuf//proto:go_default_library",
"@com_github_golang_protobuf//ptypes:go_default_library_gen",
@@ -40,6 +41,7 @@ go_test(
srcs = ["event_test.go"],
embed = [":eventchannel"],
deps = [
+ "//pkg/sync",
"@com_github_golang_protobuf//proto:go_default_library",
],
)
diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go
index d37ad0428..9a29c58bd 100644
--- a/pkg/eventchannel/event.go
+++ b/pkg/eventchannel/event.go
@@ -22,13 +22,13 @@ package eventchannel
import (
"encoding/binary"
"fmt"
- "sync"
"syscall"
"github.com/golang/protobuf/proto"
"github.com/golang/protobuf/ptypes"
pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
)
diff --git a/pkg/eventchannel/event_test.go b/pkg/eventchannel/event_test.go
index 3649097d6..7f41b4a27 100644
--- a/pkg/eventchannel/event_test.go
+++ b/pkg/eventchannel/event_test.go
@@ -16,11 +16,11 @@ package eventchannel
import (
"fmt"
- "sync"
"testing"
"time"
"github.com/golang/protobuf/proto"
+ "gvisor.dev/gvisor/pkg/sync"
)
// testEmitter is an emitter that can be used in tests. It records all events
diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD
index 56495cbd9..b0478c672 100644
--- a/pkg/fdchannel/BUILD
+++ b/pkg/fdchannel/BUILD
@@ -15,4 +15,5 @@ go_test(
size = "small",
srcs = ["fdchannel_test.go"],
embed = [":fdchannel"],
+ deps = ["//pkg/sync"],
)
diff --git a/pkg/fdchannel/fdchannel_test.go b/pkg/fdchannel/fdchannel_test.go
index 5d01dc636..7a8a63a59 100644
--- a/pkg/fdchannel/fdchannel_test.go
+++ b/pkg/fdchannel/fdchannel_test.go
@@ -17,10 +17,11 @@ package fdchannel
import (
"io/ioutil"
"os"
- "sync"
"syscall"
"testing"
"time"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
func TestSendRecvFD(t *testing.T) {
diff --git a/pkg/fdnotifier/BUILD b/pkg/fdnotifier/BUILD
index aca2d8a82..91a202a30 100644
--- a/pkg/fdnotifier/BUILD
+++ b/pkg/fdnotifier/BUILD
@@ -11,6 +11,7 @@ go_library(
importpath = "gvisor.dev/gvisor/pkg/fdnotifier",
visibility = ["//:sandbox"],
deps = [
+ "//pkg/sync",
"//pkg/waiter",
"@org_golang_x_sys//unix:go_default_library",
],
diff --git a/pkg/fdnotifier/fdnotifier.go b/pkg/fdnotifier/fdnotifier.go
index f4aae1953..a6b63c982 100644
--- a/pkg/fdnotifier/fdnotifier.go
+++ b/pkg/fdnotifier/fdnotifier.go
@@ -22,10 +22,10 @@ package fdnotifier
import (
"fmt"
- "sync"
"syscall"
"golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD
index e590a71ba..85bd83af1 100644
--- a/pkg/flipcall/BUILD
+++ b/pkg/flipcall/BUILD
@@ -19,7 +19,7 @@ go_library(
"//pkg/abi/linux",
"//pkg/log",
"//pkg/memutil",
- "//pkg/syncutil",
+ "//pkg/sync",
],
)
@@ -31,4 +31,5 @@ go_test(
"flipcall_test.go",
],
embed = [":flipcall"],
+ deps = ["//pkg/sync"],
)
diff --git a/pkg/flipcall/flipcall_example_test.go b/pkg/flipcall/flipcall_example_test.go
index 8d88b845d..2e28a149a 100644
--- a/pkg/flipcall/flipcall_example_test.go
+++ b/pkg/flipcall/flipcall_example_test.go
@@ -17,7 +17,8 @@ package flipcall
import (
"bytes"
"fmt"
- "sync"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
func Example() {
diff --git a/pkg/flipcall/flipcall_test.go b/pkg/flipcall/flipcall_test.go
index 168a487ec..33fd55a44 100644
--- a/pkg/flipcall/flipcall_test.go
+++ b/pkg/flipcall/flipcall_test.go
@@ -16,9 +16,10 @@ package flipcall
import (
"runtime"
- "sync"
"testing"
"time"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
var testPacketWindowSize = pageSize
diff --git a/pkg/flipcall/flipcall_unsafe.go b/pkg/flipcall/flipcall_unsafe.go
index 27b8939fc..ac974b232 100644
--- a/pkg/flipcall/flipcall_unsafe.go
+++ b/pkg/flipcall/flipcall_unsafe.go
@@ -18,7 +18,7 @@ import (
"reflect"
"unsafe"
- "gvisor.dev/gvisor/pkg/syncutil"
+ "gvisor.dev/gvisor/pkg/sync"
)
// Packets consist of a 16-byte header followed by an arbitrarily-sized
@@ -75,13 +75,13 @@ func (ep *Endpoint) Data() []byte {
var ioSync int64
func raceBecomeActive() {
- if syncutil.RaceEnabled {
- syncutil.RaceAcquire((unsafe.Pointer)(&ioSync))
+ if sync.RaceEnabled {
+ sync.RaceAcquire((unsafe.Pointer)(&ioSync))
}
}
func raceBecomeInactive() {
- if syncutil.RaceEnabled {
- syncutil.RaceReleaseMerge((unsafe.Pointer)(&ioSync))
+ if sync.RaceEnabled {
+ sync.RaceReleaseMerge((unsafe.Pointer)(&ioSync))
}
}
diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD
index 0c5f50397..ca540363c 100644
--- a/pkg/fspath/BUILD
+++ b/pkg/fspath/BUILD
@@ -14,7 +14,6 @@ go_library(
"fspath.go",
],
importpath = "gvisor.dev/gvisor/pkg/fspath",
- deps = ["//pkg/syserror"],
)
go_test(
@@ -25,5 +24,4 @@ go_test(
"fspath_test.go",
],
embed = [":fspath"],
- deps = ["//pkg/syserror"],
)
diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go
index f68752560..9fb3fee24 100644
--- a/pkg/fspath/fspath.go
+++ b/pkg/fspath/fspath.go
@@ -18,19 +18,17 @@ package fspath
import (
"strings"
-
- "gvisor.dev/gvisor/pkg/syserror"
)
const pathSep = '/'
-// Parse parses a pathname as described by path_resolution(7).
-func Parse(pathname string) (Path, error) {
+// Parse parses a pathname as described by path_resolution(7), except that
+// empty pathnames will be parsed successfully to a Path for which
+// Path.Absolute == Path.Dir == Path.HasComponents() == false. (This is
+// necessary to support AT_EMPTY_PATH.)
+func Parse(pathname string) Path {
if len(pathname) == 0 {
- // "... POSIX decrees that an empty pathname must not be resolved
- // successfully. Linux returns ENOENT in this case." -
- // path_resolution(7)
- return Path{}, syserror.ENOENT
+ return Path{}
}
// Skip leading path separators.
i := 0
@@ -41,7 +39,7 @@ func Parse(pathname string) (Path, error) {
return Path{
Absolute: true,
Dir: true,
- }, nil
+ }
}
}
// Skip trailing path separators. This is required by Iterator.Next. This
@@ -64,7 +62,7 @@ func Parse(pathname string) (Path, error) {
},
Absolute: i != 0,
Dir: j != len(pathname)-1,
- }, nil
+ }
}
// Path contains the information contained in a pathname string.
@@ -111,6 +109,12 @@ func (p Path) String() string {
return b.String()
}
+// HasComponents returns true if p contains a non-zero number of path
+// components.
+func (p Path) HasComponents() bool {
+ return p.Begin.Ok()
+}
+
// An Iterator represents either a path component in a Path or a terminal
// iterator indicating that the end of the path has been reached.
//
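
With the error return gone, callers no longer special-case empty pathnames at parse time; AT_EMPTY_PATH handling instead checks HasComponents on the result. A small sketch using the package's existing Begin/Ok/Next/String iterator API:

```go
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/fspath"
)

func main() {
	// Empty pathnames now parse successfully to a Path with nothing in it.
	empty := fspath.Parse("")
	fmt.Println(empty.Absolute, empty.Dir, empty.HasComponents()) // false false false

	p := fspath.Parse("/foo//bar/")
	fmt.Println(p.Absolute, p.Dir, p.HasComponents()) // true true true
	for it := p.Begin; it.Ok(); it = it.Next() {
		fmt.Println(it.String()) // "foo", then "bar"
	}
}
```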
diff --git a/pkg/fspath/fspath_test.go b/pkg/fspath/fspath_test.go
index 215b35622..d5e9a549a 100644
--- a/pkg/fspath/fspath_test.go
+++ b/pkg/fspath/fspath_test.go
@@ -18,15 +18,10 @@ import (
"reflect"
"strings"
"testing"
-
- "gvisor.dev/gvisor/pkg/syserror"
)
func TestParseIteratorPartialPathnames(t *testing.T) {
- path, err := Parse("/foo//bar///baz////")
- if err != nil {
- t.Fatalf("Parse failed: %v", err)
- }
+ path := Parse("/foo//bar///baz////")
// Parse strips leading slashes, and records their presence as
// Path.Absolute.
if !path.Absolute {
@@ -71,6 +66,12 @@ func TestParse(t *testing.T) {
}
tests := []testCase{
{
+ pathname: "",
+ relpath: []string{},
+ abs: false,
+ dir: false,
+ },
+ {
pathname: "/",
relpath: []string{},
abs: true,
@@ -113,10 +114,7 @@ func TestParse(t *testing.T) {
for _, test := range tests {
t.Run(test.pathname, func(t *testing.T) {
- p, err := Parse(test.pathname)
- if err != nil {
- t.Fatalf("failed to parse pathname %q: %v", test.pathname, err)
- }
+ p := Parse(test.pathname)
t.Logf("pathname %q => path %q", test.pathname, p)
if p.Absolute != test.abs {
t.Errorf("path absoluteness: got %v, wanted %v", p.Absolute, test.abs)
@@ -134,10 +132,3 @@ func TestParse(t *testing.T) {
})
}
}
-
-func TestParseEmptyPathname(t *testing.T) {
- p, err := Parse("")
- if err != syserror.ENOENT {
- t.Errorf("parsing empty pathname: got (%v, %v), wanted (<unspecified>, ENOENT)", p, err)
- }
-}
diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD
index 4b9321711..f22bd070d 100644
--- a/pkg/gate/BUILD
+++ b/pkg/gate/BUILD
@@ -19,5 +19,6 @@ go_test(
],
deps = [
":gate",
+ "//pkg/sync",
],
)
diff --git a/pkg/gate/gate_test.go b/pkg/gate/gate_test.go
index 5dbd8d712..850693df8 100644
--- a/pkg/gate/gate_test.go
+++ b/pkg/gate/gate_test.go
@@ -15,11 +15,11 @@
package gate_test
import (
- "sync"
"testing"
"time"
"gvisor.dev/gvisor/pkg/gate"
+ "gvisor.dev/gvisor/pkg/sync"
)
func TestBasicEnter(t *testing.T) {
diff --git a/pkg/goid/BUILD b/pkg/goid/BUILD
new file mode 100644
index 000000000..5d31e5366
--- /dev/null
+++ b/pkg/goid/BUILD
@@ -0,0 +1,26 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "goid",
+ srcs = [
+ "goid.go",
+ "goid_amd64.s",
+ "goid_race.go",
+ "goid_unsafe.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/goid",
+ visibility = ["//visibility:public"],
+)
+
+go_test(
+ name = "goid_test",
+ size = "small",
+ srcs = [
+ "empty_test.go",
+ "goid_test.go",
+ ],
+ embed = [":goid"],
+)
diff --git a/pkg/goid/empty_test.go b/pkg/goid/empty_test.go
new file mode 100644
index 000000000..c0a4b17ab
--- /dev/null
+++ b/pkg/goid/empty_test.go
@@ -0,0 +1,22 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !race
+
+package goid
+
+import "testing"
+
+// TestNothing exists to make the build system happy.
+func TestNothing(t *testing.T) {}
diff --git a/pkg/goid/goid.go b/pkg/goid/goid.go
new file mode 100644
index 000000000..39df30031
--- /dev/null
+++ b/pkg/goid/goid.go
@@ -0,0 +1,24 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !race
+
+// Package goid provides access to the ID of the current goroutine in
+// race/gotsan builds.
+package goid
+
+// Get returns the ID of the current goroutine.
+func Get() int64 {
+ panic("unimplemented for non-race builds")
+}
diff --git a/pkg/goid/goid_amd64.s b/pkg/goid/goid_amd64.s
new file mode 100644
index 000000000..d9f5cd2a3
--- /dev/null
+++ b/pkg/goid/goid_amd64.s
@@ -0,0 +1,21 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// func getg() *g
+TEXT ·getg(SB),NOSPLIT,$0-8
+ MOVQ (TLS), R14
+ MOVQ R14, ret+0(FP)
+ RET
diff --git a/pkg/goid/goid_race.go b/pkg/goid/goid_race.go
new file mode 100644
index 000000000..1766beaee
--- /dev/null
+++ b/pkg/goid/goid_race.go
@@ -0,0 +1,25 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Only available in race/gotsan builds.
+// +build race
+
+// Package goid provides access to the ID of the current goroutine in
+// race/gotsan builds.
+package goid
+
+// Get returns the ID of the current goroutine.
+func Get() int64 {
+ return goid()
+}
diff --git a/pkg/goid/goid_test.go b/pkg/goid/goid_test.go
new file mode 100644
index 000000000..31970ce79
--- /dev/null
+++ b/pkg/goid/goid_test.go
@@ -0,0 +1,74 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build race
+
+package goid
+
+import (
+ "runtime"
+ "sync"
+ "testing"
+)
+
+func TestInitialGoID(t *testing.T) {
+ const max = 10000
+ if id := goid(); id < 0 || id > max {
+ t.Errorf("got goid = %d, want 0 < goid <= %d", id, max)
+ }
+}
+
+// TestGoIDSequence verifies that goid returns values which could plausibly be
+// goroutine IDs. If this test breaks or becomes flaky, the structs in
+// goid_unsafe.go may need to be updated.
+func TestGoIDSequence(t *testing.T) {
+ // Goroutine IDs are cached by each P.
+ runtime.GOMAXPROCS(1)
+
+ // Fill any holes in lower range.
+ for i := 0; i < 50; i++ {
+ var wg sync.WaitGroup
+ wg.Add(1)
+ go func() {
+ wg.Done()
+
+ // Leak the goroutine to prevent the ID from being
+ // reused.
+ select {}
+ }()
+ wg.Wait()
+ }
+
+ id := goid()
+ for i := 0; i < 100; i++ {
+ var (
+ newID int64
+ wg sync.WaitGroup
+ )
+ wg.Add(1)
+ go func() {
+ newID = goid()
+ wg.Done()
+
+ // Leak the goroutine to prevent the ID from being
+ // reused.
+ select {}
+ }()
+ wg.Wait()
+ if max := id + 100; newID <= id || newID > max {
+ t.Errorf("unexpected goroutine ID pattern, got goid = %d, want %d < goid <= %d (previous = %d)", newID, id, max, id)
+ }
+ id = newID
+ }
+}
diff --git a/pkg/goid/goid_unsafe.go b/pkg/goid/goid_unsafe.go
new file mode 100644
index 000000000..ded8004dd
--- /dev/null
+++ b/pkg/goid/goid_unsafe.go
@@ -0,0 +1,64 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package goid
+
+// Structs from Go runtime. These may change in the future and require
+// updating. These structs are currently the same on both AMD64 and ARM64,
+// but may diverge in the future.
+
+type stack struct {
+ lo uintptr
+ hi uintptr
+}
+
+type gobuf struct {
+ sp uintptr
+ pc uintptr
+ g uintptr
+ ctxt uintptr
+ ret uint64
+ lr uintptr
+ bp uintptr
+}
+
+type g struct {
+ stack stack
+ stackguard0 uintptr
+ stackguard1 uintptr
+
+ _panic uintptr
+ _defer uintptr
+ m uintptr
+ sched gobuf
+ syscallsp uintptr
+ syscallpc uintptr
+ stktopsp uintptr
+ param uintptr
+ atomicstatus uint32
+ stackLock uint32
+ goid int64
+
+ // More fields...
+ //
+ // We only use goid; the fields before it are listed only so that the
+ // offset of goid is computed correctly.
+}
+
+func getg() *g
+
+// goid returns the ID of the current goroutine.
+func goid() int64 {
+ return getg().goid
+}
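
The package only implements goid() for race builds (the non-race Get panics), so it is meant for debug-only instrumentation compiled under `go test -race`. A hypothetical sketch of that kind of use, assuming the stock Mutex surface of //pkg/sync; the debuglock package name and holder field are illustrative, not part of this change:

```go
// +build race

// Package debuglock is a hypothetical example of race-build-only
// instrumentation built on goid.Get.
package debuglock

import (
	"gvisor.dev/gvisor/pkg/goid"
	"gvisor.dev/gvisor/pkg/sync"
)

// Mutex records the goroutine ID of its current holder, which can be logged
// when debugging a suspected self-deadlock.
type Mutex struct {
	mu     sync.Mutex
	holder int64 // 0 when unlocked
}

// Lock acquires the mutex and remembers the acquiring goroutine's ID.
func (m *Mutex) Lock() {
	m.mu.Lock()
	m.holder = goid.Get()
}

// Unlock clears the recorded holder and releases the mutex.
func (m *Mutex) Unlock() {
	m.holder = 0
	m.mu.Unlock()
}
```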
diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD
index a5d980d14..bcde6d308 100644
--- a/pkg/linewriter/BUILD
+++ b/pkg/linewriter/BUILD
@@ -8,6 +8,7 @@ go_library(
srcs = ["linewriter.go"],
importpath = "gvisor.dev/gvisor/pkg/linewriter",
visibility = ["//visibility:public"],
+ deps = ["//pkg/sync"],
)
go_test(
diff --git a/pkg/linewriter/linewriter.go b/pkg/linewriter/linewriter.go
index cd6e4e2ce..a1b1285d4 100644
--- a/pkg/linewriter/linewriter.go
+++ b/pkg/linewriter/linewriter.go
@@ -17,7 +17,8 @@ package linewriter
import (
"bytes"
- "sync"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
// Writer is an io.Writer which buffers input, flushing
diff --git a/pkg/log/BUILD b/pkg/log/BUILD
index fc5f5779b..0df0f2849 100644
--- a/pkg/log/BUILD
+++ b/pkg/log/BUILD
@@ -16,7 +16,10 @@ go_library(
visibility = [
"//visibility:public",
],
- deps = ["//pkg/linewriter"],
+ deps = [
+ "//pkg/linewriter",
+ "//pkg/sync",
+ ],
)
go_test(
diff --git a/pkg/log/log.go b/pkg/log/log.go
index 9387586e6..91a81b288 100644
--- a/pkg/log/log.go
+++ b/pkg/log/log.go
@@ -25,12 +25,12 @@ import (
stdlog "log"
"os"
"runtime"
- "sync"
"sync/atomic"
"syscall"
"time"
"gvisor.dev/gvisor/pkg/linewriter"
+ "gvisor.dev/gvisor/pkg/sync"
)
// Level is the log level.
diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD
index dd6ca6d39..9145f3233 100644
--- a/pkg/metric/BUILD
+++ b/pkg/metric/BUILD
@@ -14,6 +14,7 @@ go_library(
":metric_go_proto",
"//pkg/eventchannel",
"//pkg/log",
+ "//pkg/sync",
],
)
diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go
index eadde06e4..93d4f2b8c 100644
--- a/pkg/metric/metric.go
+++ b/pkg/metric/metric.go
@@ -18,12 +18,12 @@ package metric
import (
"errors"
"fmt"
- "sync"
"sync/atomic"
"gvisor.dev/gvisor/pkg/eventchannel"
"gvisor.dev/gvisor/pkg/log"
pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto"
+ "gvisor.dev/gvisor/pkg/sync"
)
var (
diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD
index f32244c69..a3e05c96d 100644
--- a/pkg/p9/BUILD
+++ b/pkg/p9/BUILD
@@ -29,6 +29,7 @@ go_library(
"//pkg/fdchannel",
"//pkg/flipcall",
"//pkg/log",
+ "//pkg/sync",
"//pkg/unet",
"@org_golang_x_sys//unix:go_default_library",
],
diff --git a/pkg/p9/client.go b/pkg/p9/client.go
index 221516c6c..4045e41fa 100644
--- a/pkg/p9/client.go
+++ b/pkg/p9/client.go
@@ -17,12 +17,12 @@ package p9
import (
"errors"
"fmt"
- "sync"
"syscall"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/flipcall"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
)
diff --git a/pkg/p9/handlers.go b/pkg/p9/handlers.go
index 51869c7d6..b9582c07f 100644
--- a/pkg/p9/handlers.go
+++ b/pkg/p9/handlers.go
@@ -257,7 +257,6 @@ func CanOpen(mode FileMode) bool {
// handle implements handler.handle.
func (t *Tlopen) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -294,7 +293,6 @@ func (t *Tlopen) handle(cs *connState) message {
return syscall.EINVAL
}
- // Do the open.
osFile, qid, ioUnit, err = ref.file.Open(t.Flags)
return err
}); err != nil {
@@ -311,12 +309,10 @@ func (t *Tlopen) handle(cs *connState) message {
}
func (t *Tlcreate) do(cs *connState, uid UID) (*Rlcreate, error) {
- // Don't allow complex names.
if err := checkSafeName(t.Name); err != nil {
return nil, err
}
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return nil, syscall.EBADF
@@ -390,12 +386,10 @@ func (t *Tsymlink) handle(cs *connState) message {
}
func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) {
- // Don't allow complex names.
if err := checkSafeName(t.Name); err != nil {
return nil, err
}
- // Lookup the FID.
ref, ok := cs.LookupFID(t.Directory)
if !ok {
return nil, syscall.EBADF
@@ -426,19 +420,16 @@ func (t *Tsymlink) do(cs *connState, uid UID) (*Rsymlink, error) {
// handle implements handler.handle.
func (t *Tlink) handle(cs *connState) message {
- // Don't allow complex names.
if err := checkSafeName(t.Name); err != nil {
return newErr(err)
}
- // Lookup the FID.
ref, ok := cs.LookupFID(t.Directory)
if !ok {
return newErr(syscall.EBADF)
}
defer ref.DecRef()
- // Lookup the other FID.
refTarget, ok := cs.LookupFID(t.Target)
if !ok {
return newErr(syscall.EBADF)
@@ -467,7 +458,6 @@ func (t *Tlink) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Trenameat) handle(cs *connState) message {
- // Don't allow complex names.
if err := checkSafeName(t.OldName); err != nil {
return newErr(err)
}
@@ -475,14 +465,12 @@ func (t *Trenameat) handle(cs *connState) message {
return newErr(err)
}
- // Lookup the FID.
ref, ok := cs.LookupFID(t.OldDirectory)
if !ok {
return newErr(syscall.EBADF)
}
defer ref.DecRef()
- // Lookup the other FID.
refTarget, ok := cs.LookupFID(t.NewDirectory)
if !ok {
return newErr(syscall.EBADF)
@@ -523,12 +511,10 @@ func (t *Trenameat) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Tunlinkat) handle(cs *connState) message {
- // Don't allow complex names.
if err := checkSafeName(t.Name); err != nil {
return newErr(err)
}
- // Lookup the FID.
ref, ok := cs.LookupFID(t.Directory)
if !ok {
return newErr(syscall.EBADF)
@@ -577,19 +563,16 @@ func (t *Tunlinkat) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Trename) handle(cs *connState) message {
- // Don't allow complex names.
if err := checkSafeName(t.Name); err != nil {
return newErr(err)
}
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
}
defer ref.DecRef()
- // Lookup the target.
refTarget, ok := cs.LookupFID(t.Directory)
if !ok {
return newErr(syscall.EBADF)
@@ -641,7 +624,6 @@ func (t *Trename) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Treadlink) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -669,7 +651,6 @@ func (t *Treadlink) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Tread) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -708,7 +689,6 @@ func (t *Tread) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Twrite) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -747,12 +727,10 @@ func (t *Tmknod) handle(cs *connState) message {
}
func (t *Tmknod) do(cs *connState, uid UID) (*Rmknod, error) {
- // Don't allow complex names.
if err := checkSafeName(t.Name); err != nil {
return nil, err
}
- // Lookup the FID.
ref, ok := cs.LookupFID(t.Directory)
if !ok {
return nil, syscall.EBADF
@@ -791,12 +769,10 @@ func (t *Tmkdir) handle(cs *connState) message {
}
func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) {
- // Don't allow complex names.
if err := checkSafeName(t.Name); err != nil {
return nil, err
}
- // Lookup the FID.
ref, ok := cs.LookupFID(t.Directory)
if !ok {
return nil, syscall.EBADF
@@ -827,7 +803,6 @@ func (t *Tmkdir) do(cs *connState, uid UID) (*Rmkdir, error) {
// handle implements handler.handle.
func (t *Tgetattr) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -856,7 +831,6 @@ func (t *Tgetattr) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Tsetattr) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -883,7 +857,6 @@ func (t *Tsetattr) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Tallocate) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -917,7 +890,6 @@ func (t *Tallocate) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Txattrwalk) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -930,7 +902,6 @@ func (t *Txattrwalk) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Txattrcreate) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -943,7 +914,6 @@ func (t *Txattrcreate) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Treaddir) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.Directory)
if !ok {
return newErr(syscall.EBADF)
@@ -977,7 +947,6 @@ func (t *Treaddir) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Tfsync) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -1001,7 +970,6 @@ func (t *Tfsync) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Tstatfs) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -1192,7 +1160,6 @@ func doWalk(cs *connState, ref *fidRef, names []string, getattr bool) (qids []QI
// handle implements handler.handle.
func (t *Twalk) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -1213,7 +1180,6 @@ func (t *Twalk) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Twalkgetattr) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -1270,7 +1236,6 @@ func (t *Tumknod) handle(cs *connState) message {
// handle implements handler.handle.
func (t *Tlconnect) handle(cs *connState) message {
- // Lookup the FID.
ref, ok := cs.LookupFID(t.FID)
if !ok {
return newErr(syscall.EBADF)
@@ -1303,7 +1268,6 @@ func (t *Tchannel) handle(cs *connState) message {
return newErr(err)
}
- // Lookup the given channel.
ch := cs.lookupChannel(t.ID)
if ch == nil {
return newErr(syscall.ENOSYS)
diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD
index 28707c0ca..f4edd68b2 100644
--- a/pkg/p9/p9test/BUILD
+++ b/pkg/p9/p9test/BUILD
@@ -70,6 +70,7 @@ go_library(
"//pkg/fd",
"//pkg/log",
"//pkg/p9",
+ "//pkg/sync",
"//pkg/unet",
"@com_github_golang_mock//gomock:go_default_library",
],
@@ -83,6 +84,7 @@ go_test(
deps = [
"//pkg/fd",
"//pkg/p9",
+ "//pkg/sync",
"@com_github_golang_mock//gomock:go_default_library",
],
)
diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go
index 6e758148d..6e7bb3db2 100644
--- a/pkg/p9/p9test/client_test.go
+++ b/pkg/p9/p9test/client_test.go
@@ -22,7 +22,6 @@ import (
"os"
"reflect"
"strings"
- "sync"
"syscall"
"testing"
"time"
@@ -30,6 +29,7 @@ import (
"github.com/golang/mock/gomock"
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sync"
)
func TestPanic(t *testing.T) {
diff --git a/pkg/p9/p9test/p9test.go b/pkg/p9/p9test/p9test.go
index 4d3271b37..dd8b01b6d 100644
--- a/pkg/p9/p9test/p9test.go
+++ b/pkg/p9/p9test/p9test.go
@@ -17,13 +17,13 @@ package p9test
import (
"fmt"
- "sync"
"sync/atomic"
"syscall"
"testing"
"github.com/golang/mock/gomock"
"gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
)
diff --git a/pkg/p9/path_tree.go b/pkg/p9/path_tree.go
index 865459411..72ef53313 100644
--- a/pkg/p9/path_tree.go
+++ b/pkg/p9/path_tree.go
@@ -16,7 +16,8 @@ package p9
import (
"fmt"
- "sync"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
// pathNode is a single node in a path traversal.
diff --git a/pkg/p9/pool.go b/pkg/p9/pool.go
index 52de889e1..2b14a5ce3 100644
--- a/pkg/p9/pool.go
+++ b/pkg/p9/pool.go
@@ -15,7 +15,7 @@
package p9
import (
- "sync"
+ "gvisor.dev/gvisor/pkg/sync"
)
// pool is a simple allocator.
diff --git a/pkg/p9/server.go b/pkg/p9/server.go
index 40b8fa023..fdfa83648 100644
--- a/pkg/p9/server.go
+++ b/pkg/p9/server.go
@@ -17,7 +17,6 @@ package p9
import (
"io"
"runtime/debug"
- "sync"
"sync/atomic"
"syscall"
@@ -25,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/fdchannel"
"gvisor.dev/gvisor/pkg/flipcall"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
)
diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go
index 6e8b4bbcd..9c11e28ce 100644
--- a/pkg/p9/transport.go
+++ b/pkg/p9/transport.go
@@ -19,11 +19,11 @@ import (
"fmt"
"io"
"io/ioutil"
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
)
diff --git a/pkg/procid/BUILD b/pkg/procid/BUILD
index 078f084b2..b506813f0 100644
--- a/pkg/procid/BUILD
+++ b/pkg/procid/BUILD
@@ -21,6 +21,7 @@ go_test(
"procid_test.go",
],
embed = [":procid"],
+ deps = ["//pkg/sync"],
)
go_test(
@@ -31,4 +32,5 @@ go_test(
"procid_test.go",
],
embed = [":procid"],
+ deps = ["//pkg/sync"],
)
diff --git a/pkg/procid/procid_test.go b/pkg/procid/procid_test.go
index 88dd0b3ae..9ec08c3d6 100644
--- a/pkg/procid/procid_test.go
+++ b/pkg/procid/procid_test.go
@@ -17,9 +17,10 @@ package procid
import (
"os"
"runtime"
- "sync"
"syscall"
"testing"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
// runOnMain is used to send functions to run on the main (initial) thread.
diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD
index f4f2001f3..9d5b4859b 100644
--- a/pkg/rand/BUILD
+++ b/pkg/rand/BUILD
@@ -10,5 +10,8 @@ go_library(
],
importpath = "gvisor.dev/gvisor/pkg/rand",
visibility = ["//:sandbox"],
- deps = ["@org_golang_x_sys//unix:go_default_library"],
+ deps = [
+ "//pkg/sync",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
)
diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go
index 2b92db3e6..0bdad5fad 100644
--- a/pkg/rand/rand_linux.go
+++ b/pkg/rand/rand_linux.go
@@ -19,9 +19,9 @@ package rand
import (
"crypto/rand"
"io"
- "sync"
"golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/sync"
)
// reader implements an io.Reader that returns pseudorandom bytes.
diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD
index 7ad59dfd7..974d9af9b 100644
--- a/pkg/refs/BUILD
+++ b/pkg/refs/BUILD
@@ -27,6 +27,7 @@ go_library(
visibility = ["//:sandbox"],
deps = [
"//pkg/log",
+ "//pkg/sync",
],
)
@@ -35,4 +36,5 @@ go_test(
size = "small",
srcs = ["refcounter_test.go"],
embed = [":refs"],
+ deps = ["//pkg/sync"],
)
diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go
index ad69e0757..c45ba8200 100644
--- a/pkg/refs/refcounter.go
+++ b/pkg/refs/refcounter.go
@@ -21,10 +21,10 @@ import (
"fmt"
"reflect"
"runtime"
- "sync"
"sync/atomic"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sync"
)
// RefCounter is the interface to be implemented by objects that are reference
diff --git a/pkg/refs/refcounter_test.go b/pkg/refs/refcounter_test.go
index ffd3d3f07..1ab4a4440 100644
--- a/pkg/refs/refcounter_test.go
+++ b/pkg/refs/refcounter_test.go
@@ -16,8 +16,9 @@ package refs
import (
"reflect"
- "sync"
"testing"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
type testCounter struct {
diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD
index 18c73cc24..65f22af2b 100644
--- a/pkg/sentry/arch/BUILD
+++ b/pkg/sentry/arch/BUILD
@@ -9,17 +9,23 @@ go_library(
srcs = [
"aligned.go",
"arch.go",
+ "arch_aarch64.go",
"arch_amd64.go",
"arch_amd64.s",
+ "arch_arm64.go",
+ "arch_state_aarch64.go",
"arch_state_x86.go",
"arch_x86.go",
"auxv.go",
+ "signal.go",
"signal_act.go",
"signal_amd64.go",
+ "signal_arm64.go",
"signal_info.go",
"signal_stack.go",
"stack.go",
"syscalls_amd64.go",
+ "syscalls_arm64.go",
],
importpath = "gvisor.dev/gvisor/pkg/sentry/arch",
visibility = ["//:sandbox"],
@@ -32,6 +38,7 @@ go_library(
"//pkg/sentry/context",
"//pkg/sentry/limits",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
],
)
diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go
index 498ca4669..81ec98a77 100644
--- a/pkg/sentry/arch/arch.go
+++ b/pkg/sentry/arch/arch.go
@@ -125,9 +125,9 @@ type Context interface {
// SetTLS sets the current TLS pointer. Returns false if value is invalid.
SetTLS(value uintptr) bool
- // SetRSEQInterruptedIP sets the register that contains the old IP when a
- // restartable sequence is interrupted.
- SetRSEQInterruptedIP(value uintptr)
+ // SetOldRSeqInterruptedIP sets the register that contains the old IP
+ // when an "old rseq" restartable sequence is interrupted.
+ SetOldRSeqInterruptedIP(value uintptr)
// StateData returns a pointer to underlying architecture state.
StateData() *State
diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go
new file mode 100644
index 000000000..ea4dedbdf
--- /dev/null
+++ b/pkg/sentry/arch/arch_aarch64.go
@@ -0,0 +1,293 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package arch
+
+import (
+ "fmt"
+ "io"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/log"
+ rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+ // SyscallWidth is the width of instructions.
+ SyscallWidth = 4
+)
+
+// aarch64FPState is aarch64 floating point state.
+type aarch64FPState []byte
+
+// initAarch64FPState sets up the initial floating point state.
+func initAarch64FPState(data *FloatingPointData) {
+ // TODO(gvisor.dev/issue/1238): floating-point is not supported.
+}
+
+func newAarch64FPStateSlice() []byte {
+ return alignedBytes(4096, 32)[:4096]
+}
+
+// newAarch64FPState returns an initialized floating point state.
+//
+// The returned state is large enough to store all floating point state
+// supported by the host, even if the app won't use much of it due to a
+// restricted FeatureSet. Since the app may still be able to see state not
+// advertised by CPUID, we must ensure it does not contain any sentry state.
+func newAarch64FPState() aarch64FPState {
+ f := aarch64FPState(newAarch64FPStateSlice())
+ initAarch64FPState(f.FloatingPointData())
+ return f
+}
+
+// fork creates and returns an identical copy of the aarch64 floating point state.
+func (f aarch64FPState) fork() aarch64FPState {
+ n := aarch64FPState(newAarch64FPStateSlice())
+ copy(n, f)
+ return n
+}
+
+// FloatingPointData returns the raw data pointer.
+func (f aarch64FPState) FloatingPointData() *FloatingPointData {
+ return (*FloatingPointData)(&f[0])
+}
+
+// NewFloatingPointData returns a new floating point data blob.
+//
+// This is primarily for use in tests.
+func NewFloatingPointData() *FloatingPointData {
+ return (*FloatingPointData)(&(newAarch64FPState()[0]))
+}
+
+// State contains the common architecture bits for aarch64 (the build tag of this
+// file ensures it's only built on aarch64).
+type State struct {
+ // The system registers.
+ Regs syscall.PtraceRegs `state:".(syscallPtraceRegs)"`
+
+ // Our floating point state.
+ aarch64FPState `state:"wait"`
+
+ // FeatureSet is a pointer to the currently active feature set.
+ FeatureSet *cpuid.FeatureSet
+}
+
+// Proto returns a protobuf representation of the system registers in State.
+func (s State) Proto() *rpb.Registers {
+ regs := &rpb.ARM64Registers{
+ R0: s.Regs.Regs[0],
+ R1: s.Regs.Regs[1],
+ R2: s.Regs.Regs[2],
+ R3: s.Regs.Regs[3],
+ R4: s.Regs.Regs[4],
+ R5: s.Regs.Regs[5],
+ R6: s.Regs.Regs[6],
+ R7: s.Regs.Regs[7],
+ R8: s.Regs.Regs[8],
+ R9: s.Regs.Regs[9],
+ R10: s.Regs.Regs[10],
+ R11: s.Regs.Regs[11],
+ R12: s.Regs.Regs[12],
+ R13: s.Regs.Regs[13],
+ R14: s.Regs.Regs[14],
+ R15: s.Regs.Regs[15],
+ R16: s.Regs.Regs[16],
+ R17: s.Regs.Regs[17],
+ R18: s.Regs.Regs[18],
+ R19: s.Regs.Regs[19],
+ R20: s.Regs.Regs[20],
+ R21: s.Regs.Regs[21],
+ R22: s.Regs.Regs[22],
+ R23: s.Regs.Regs[23],
+ R24: s.Regs.Regs[24],
+ R25: s.Regs.Regs[25],
+ R26: s.Regs.Regs[26],
+ R27: s.Regs.Regs[27],
+ R28: s.Regs.Regs[28],
+ R29: s.Regs.Regs[29],
+ R30: s.Regs.Regs[30],
+ Sp: s.Regs.Sp,
+ Pc: s.Regs.Pc,
+ Pstate: s.Regs.Pstate,
+ }
+ return &rpb.Registers{Arch: &rpb.Registers_Arm64{Arm64: regs}}
+}
+
+// Fork creates and returns an identical copy of the state.
+func (s *State) Fork() State {
+ // TODO(gvisor.dev/issue/1238): floating-point is not supported.
+ return State{
+ Regs: s.Regs,
+ FeatureSet: s.FeatureSet,
+ }
+}
+
+// StateData implements Context.StateData.
+func (s *State) StateData() *State {
+ return s
+}
+
+// CPUIDEmulate emulates a cpuid instruction.
+func (s *State) CPUIDEmulate(l log.Logger) {
+ // TODO(gvisor.dev/issue/1255): cpuid is not supported.
+}
+
+// SingleStep implements Context.SingleStep.
+func (s *State) SingleStep() bool {
+ return false
+}
+
+// SetSingleStep enables single stepping.
+func (s *State) SetSingleStep() {
+ // Set the trap flag.
+ // TODO(gvisor.dev/issue/1239): ptrace single-step is not supported.
+}
+
+// ClearSingleStep disables single stepping.
+func (s *State) ClearSingleStep() {
+ // Clear the trap flag.
+ // TODO(gvisor.dev/issue/1239): ptrace single-step is not supported.
+}
+
+// RegisterMap returns a map of all registers.
+func (s *State) RegisterMap() (map[string]uintptr, error) {
+ return map[string]uintptr{
+ "R0": uintptr(s.Regs.Regs[0]),
+ "R1": uintptr(s.Regs.Regs[1]),
+ "R2": uintptr(s.Regs.Regs[2]),
+ "R3": uintptr(s.Regs.Regs[3]),
+ "R4": uintptr(s.Regs.Regs[4]),
+ "R5": uintptr(s.Regs.Regs[5]),
+ "R6": uintptr(s.Regs.Regs[6]),
+ "R7": uintptr(s.Regs.Regs[7]),
+ "R8": uintptr(s.Regs.Regs[8]),
+ "R9": uintptr(s.Regs.Regs[9]),
+ "R10": uintptr(s.Regs.Regs[10]),
+ "R11": uintptr(s.Regs.Regs[11]),
+ "R12": uintptr(s.Regs.Regs[12]),
+ "R13": uintptr(s.Regs.Regs[13]),
+ "R14": uintptr(s.Regs.Regs[14]),
+ "R15": uintptr(s.Regs.Regs[15]),
+ "R16": uintptr(s.Regs.Regs[16]),
+ "R17": uintptr(s.Regs.Regs[17]),
+ "R18": uintptr(s.Regs.Regs[18]),
+ "R19": uintptr(s.Regs.Regs[19]),
+ "R20": uintptr(s.Regs.Regs[20]),
+ "R21": uintptr(s.Regs.Regs[21]),
+ "R22": uintptr(s.Regs.Regs[22]),
+ "R23": uintptr(s.Regs.Regs[23]),
+ "R24": uintptr(s.Regs.Regs[24]),
+ "R25": uintptr(s.Regs.Regs[25]),
+ "R26": uintptr(s.Regs.Regs[26]),
+ "R27": uintptr(s.Regs.Regs[27]),
+ "R28": uintptr(s.Regs.Regs[28]),
+ "R29": uintptr(s.Regs.Regs[29]),
+ "R30": uintptr(s.Regs.Regs[30]),
+ "Sp": uintptr(s.Regs.Sp),
+ "Pc": uintptr(s.Regs.Pc),
+ "Pstate": uintptr(s.Regs.Pstate),
+ }, nil
+}
+
+// PtraceGetRegs implements Context.PtraceGetRegs.
+func (s *State) PtraceGetRegs(dst io.Writer) (int, error) {
+ return dst.Write(binary.Marshal(nil, usermem.ByteOrder, s.ptraceGetRegs()))
+}
+
+func (s *State) ptraceGetRegs() syscall.PtraceRegs {
+ return s.Regs
+}
+
+var ptraceRegsSize = int(binary.Size(syscall.PtraceRegs{}))
+
+// PtraceSetRegs implements Context.PtraceSetRegs.
+func (s *State) PtraceSetRegs(src io.Reader) (int, error) {
+ var regs syscall.PtraceRegs
+ buf := make([]byte, ptraceRegsSize)
+ if _, err := io.ReadFull(src, buf); err != nil {
+ return 0, err
+ }
+ binary.Unmarshal(buf, usermem.ByteOrder, &regs)
+ s.Regs = regs
+ return ptraceRegsSize, nil
+}
+
+// PtraceGetFPRegs implements Context.PtraceGetFPRegs.
+func (s *State) PtraceGetFPRegs(dst io.Writer) (int, error) {
+ // TODO(gvisor.dev/issue/1238): floating-point is not supported.
+ return 0, nil
+}
+
+// PtraceSetFPRegs implements Context.PtraceSetFPRegs.
+func (s *State) PtraceSetFPRegs(src io.Reader) (int, error) {
+ // TODO(gvisor.dev/issue/1238): floating-point is not supported.
+ return 0, nil
+}
+
+// Register sets defined in include/uapi/linux/elf.h.
+const (
+ _NT_PRSTATUS = 1
+ _NT_PRFPREG = 2
+)
+
+// PtraceGetRegSet implements Context.PtraceGetRegSet.
+func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) {
+ switch regset {
+ case _NT_PRSTATUS:
+ if maxlen < ptraceRegsSize {
+ return 0, syserror.EFAULT
+ }
+ return s.PtraceGetRegs(dst)
+ default:
+ return 0, syserror.EINVAL
+ }
+}
+
+// PtraceSetRegSet implements Context.PtraceSetRegSet.
+func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) {
+ switch regset {
+ case _NT_PRSTATUS:
+ if maxlen < ptraceRegsSize {
+ return 0, syserror.EFAULT
+ }
+ return s.PtraceSetRegs(src)
+ default:
+ return 0, syserror.EINVAL
+ }
+}
+
+// FullRestore indicates whether a full restore is required.
+func (s *State) FullRestore() bool {
+ return false
+}
+
+// New returns a new architecture context.
+func New(arch Arch, fs *cpuid.FeatureSet) Context {
+ switch arch {
+ case ARM64:
+ return &context64{
+ State{
+ FeatureSet: fs,
+ },
+ }
+ }
+ panic(fmt.Sprintf("unknown architecture %v", arch))
+}
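Aside: PtraceGetRegs and PtraceSetRegs above round-trip the fixed-size aarch64 register file through a flat byte blob. A minimal standalone sketch of the same pattern, using only the standard library (the patch itself uses gvisor's binary and usermem packages with syscall.PtraceRegs; the names below are illustrative):

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// ptraceRegs mirrors the aarch64 user_pt_regs layout marshaled above.
type ptraceRegs struct {
	Regs   [31]uint64
	Sp     uint64
	Pc     uint64
	Pstate uint64
}

func main() {
	src := ptraceRegs{Pc: 0x400000, Sp: 0x7ffffff000}
	src.Regs[0] = 42

	// "PtraceGetRegs": serialize the register struct to a byte stream.
	var buf bytes.Buffer
	if err := binary.Write(&buf, binary.LittleEndian, &src); err != nil {
		panic(err)
	}

	// "PtraceSetRegs": read the fixed-size blob back into a register struct.
	var dst ptraceRegs
	if err := binary.Read(&buf, binary.LittleEndian, &dst); err != nil {
		panic(err)
	}
	fmt.Println(dst.Regs[0], dst.Pc == src.Pc) // 42 true
}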
diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go
index 9e7db8b30..2aa08b1a9 100644
--- a/pkg/sentry/arch/arch_amd64.go
+++ b/pkg/sentry/arch/arch_amd64.go
@@ -174,8 +174,8 @@ func (c *context64) SetTLS(value uintptr) bool {
return true
}
-// SetRSEQInterruptedIP implements Context.SetRSEQInterruptedIP.
-func (c *context64) SetRSEQInterruptedIP(value uintptr) {
+// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
+func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
c.Regs.R10 = uint64(value)
}
@@ -305,7 +305,7 @@ func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
buf := binary.Marshal(nil, usermem.ByteOrder, c.ptraceGetRegs())
return c.Native(uintptr(usermem.ByteOrder.Uint64(buf[addr:]))), nil
}
- // TODO(b/34088053): debug registers
+ // Note: x86 debug registers are missing.
return c.Native(0), nil
}
@@ -320,6 +320,6 @@ func (c *context64) PtracePokeUser(addr, data uintptr) error {
_, err := c.PtraceSetRegs(bytes.NewBuffer(buf))
return err
}
- // TODO(b/34088053): debug registers
+ // Note: x86 debug registers are missing.
return nil
}
diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go
new file mode 100644
index 000000000..0d5b7d317
--- /dev/null
+++ b/pkg/sentry/arch/arch_arm64.go
@@ -0,0 +1,266 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+ "fmt"
+ "math/rand"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// Host specifies the host architecture.
+const Host = ARM64
+
+// These constants come directly from Linux.
+const (
+ // maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux
+ // for a 64-bit process.
+ maxAddr64 usermem.Addr = (1 << 48)
+
+ // maxStackRand64 is the maximum randomization to apply to the stack.
+ // It is defined by arch/arm64/mm/mmap.c:(STACK_RND_MASK << PAGE_SHIFT) in Linux.
+ maxStackRand64 = 0x3ffff << 12 // ~1 GB
+
+ // maxMmapRand64 is the maximum randomization to apply to the mmap
+ // layout. It is defined by arch/arm64/mm/mmap.c:arch_mmap_rnd in Linux.
+ maxMmapRand64 = (1 << 33) * usermem.PageSize
+
+ // minGap64 is the minimum gap to leave at the top of the address space
+ // for the stack. It is defined by arch/arm64/mm/mmap.c:MIN_GAP in Linux.
+ minGap64 = (128 << 20) + maxStackRand64
+
+ // preferredPIELoadAddr is the standard Linux position-independent
+ // executable base load address. It is ELF_ET_DYN_BASE in Linux.
+ //
+ // The Platform {Min,Max}UserAddress() may preclude loading at this
+ // address. See other preferredFoo comments below.
+ preferredPIELoadAddr usermem.Addr = maxAddr64 / 6 * 5
+)
+
+// These constants are selected as heuristics to help make the Platform's
+// potentially limited address space conform as closely to Linux as possible.
+const (
+ preferredTopDownAllocMin usermem.Addr = 0x7e8000000000
+ preferredAllocationGap = 128 << 30 // 128 GB
+ preferredTopDownBaseMin = preferredTopDownAllocMin + preferredAllocationGap
+
+ // minMmapRand64 is the smallest we are willing to make the
+ // randomization to stay above preferredTopDownBaseMin.
+ minMmapRand64 = (1 << 18) * usermem.PageSize
+)
+
+// context64 represents an ARM64 context.
+type context64 struct {
+ State
+}
+
+// Arch implements Context.Arch.
+func (c *context64) Arch() Arch {
+ return ARM64
+}
+
+// Fork returns an exact copy of this context.
+func (c *context64) Fork() Context {
+ return &context64{
+ State: c.State.Fork(),
+ }
+}
+
+// General purpose registers usage on Arm64:
+// R0...R7: parameter/result registers.
+// R8: indirect result location register.
+// R9...R15: temporary registers.
+// R16: the first intra-procedure-call scratch register.
+// R17: the second intra-procedure-call scratch register.
+// R18: the platform register.
+// R19...R28: callee-saved registers.
+// R29: the frame pointer.
+// R30: the link register.
+
+// Return returns the current syscall return value.
+func (c *context64) Return() uintptr {
+ return uintptr(c.Regs.Regs[0])
+}
+
+// SetReturn sets the syscall return value.
+func (c *context64) SetReturn(value uintptr) {
+ c.Regs.Regs[0] = uint64(value)
+}
+
+// IP returns the current instruction pointer.
+func (c *context64) IP() uintptr {
+ return uintptr(c.Regs.Pc)
+}
+
+// SetIP sets the current instruction pointer.
+func (c *context64) SetIP(value uintptr) {
+ c.Regs.Pc = uint64(value)
+}
+
+// Stack returns the current stack pointer.
+func (c *context64) Stack() uintptr {
+ return uintptr(c.Regs.Sp)
+}
+
+// SetStack sets the current stack pointer.
+func (c *context64) SetStack(value uintptr) {
+ c.Regs.Sp = uint64(value)
+}
+
+// TLS returns the current TLS pointer.
+func (c *context64) TLS() uintptr {
+ // TODO(gvisor.dev/issue/1238): TLS is not supported.
+ // MRS_TPIDR_EL0
+ return 0
+}
+
+// SetTLS sets the current TLS pointer. Returns false if value is invalid.
+func (c *context64) SetTLS(value uintptr) bool {
+ // TODO(gvisor.dev/issue/1238): TLS is not supported.
+ // MSR_TPIDR_EL0
+ return false
+}
+
+// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
+func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
+ c.Regs.Regs[3] = uint64(value)
+}
+
+// Native returns the native type for the given val.
+func (c *context64) Native(val uintptr) interface{} {
+ v := uint64(val)
+ return &v
+}
+
+// Value returns the generic val for the given native type.
+func (c *context64) Value(val interface{}) uintptr {
+ return uintptr(*val.(*uint64))
+}
+
+// Width returns the byte width of this architecture.
+func (c *context64) Width() uint {
+ return 8
+}
+
+// FeatureSet returns the FeatureSet in use.
+func (c *context64) FeatureSet() *cpuid.FeatureSet {
+ return c.State.FeatureSet
+}
+
+// mmapRand returns a random adjustment for randomizing an mmap layout.
+func mmapRand(max uint64) usermem.Addr {
+ return usermem.Addr(rand.Int63n(int64(max))).RoundDown()
+}
+
+// NewMmapLayout implements Context.NewMmapLayout consistently with Linux.
+func (c *context64) NewMmapLayout(min, max usermem.Addr, r *limits.LimitSet) (MmapLayout, error) {
+ min, ok := min.RoundUp()
+ if !ok {
+ return MmapLayout{}, syscall.EINVAL
+ }
+ if max > maxAddr64 {
+ max = maxAddr64
+ }
+ max = max.RoundDown()
+
+ if min > max {
+ return MmapLayout{}, syscall.EINVAL
+ }
+
+ stackSize := r.Get(limits.Stack)
+
+ // MAX_GAP in Linux.
+ maxGap := (max / 6) * 5
+ gap := usermem.Addr(stackSize.Cur)
+ if gap < minGap64 {
+ gap = minGap64
+ }
+ if gap > maxGap {
+ gap = maxGap
+ }
+ defaultDir := MmapTopDown
+ if stackSize.Cur == limits.Infinity {
+ defaultDir = MmapBottomUp
+ }
+
+ topDownMin := max - gap - maxMmapRand64
+ maxRand := usermem.Addr(maxMmapRand64)
+ if topDownMin < preferredTopDownBaseMin {
+ // Try to keep TopDownBase above preferredTopDownBaseMin by
+ // shrinking maxRand.
+ maxAdjust := maxRand - minMmapRand64
+ needAdjust := preferredTopDownBaseMin - topDownMin
+ if needAdjust <= maxAdjust {
+ maxRand -= needAdjust
+ }
+ }
+
+ rnd := mmapRand(uint64(maxRand))
+ l := MmapLayout{
+ MinAddr: min,
+ MaxAddr: max,
+ // TASK_UNMAPPED_BASE in Linux.
+ BottomUpBase: (max/3 + rnd).RoundDown(),
+ TopDownBase: (max - gap - rnd).RoundDown(),
+ DefaultDirection: defaultDir,
+ // We may have reduced the maximum randomization to keep
+ // TopDownBase above preferredTopDownBaseMin while maintaining
+ // our stack gap. Stack allocations must use that max
+ // randomization to avoid eating into the gap.
+ MaxStackRand: uint64(maxRand),
+ }
+
+ // Final sanity check on the layout.
+ if !l.Valid() {
+ panic(fmt.Sprintf("Invalid MmapLayout: %+v", l))
+ }
+
+ return l, nil
+}
+
+// PIELoadAddress implements Context.PIELoadAddress.
+func (c *context64) PIELoadAddress(l MmapLayout) usermem.Addr {
+ base := preferredPIELoadAddr
+ max, ok := base.AddLength(maxMmapRand64)
+ if !ok {
+ panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base))
+ }
+
+ if max > l.MaxAddr {
+ // preferredPIELoadAddr won't fit; fall back to the standard
+ // Linux behavior of 2/3 of TopDownBase. TSAN won't like this.
+ //
+ // Don't bother trying to shrink the randomization for now.
+ base = l.TopDownBase / 3 * 2
+ }
+
+ return base + mmapRand(maxMmapRand64)
+}
+
+// PtracePeekUser implements Context.PtracePeekUser.
+func (c *context64) PtracePeekUser(addr uintptr) (interface{}, error) {
+ // TODO(gvisor.dev/issue/1239): Full ptrace support for Arm64.
+ return c.Native(0), nil
+}
+
+// PtracePokeUser implements Context.PtracePokeUser.
+func (c *context64) PtracePokeUser(addr, data uintptr) error {
+ // TODO(gvisor.dev/issue/1239): Full ptrace support for Arm64.
+ return nil
+}
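Aside: NewMmapLayout above clamps the stack gap between minGap64 and MAX_GAP (5/6 of the usable address space) and subtracts a page-aligned random offset from the top of the address space. A standalone sketch of that arithmetic with plain uint64 values in place of usermem.Addr (constants copied from the patch; the 8 MB stack limit is an illustrative input):

package main

import (
	"fmt"
	"math/rand"
)

const (
	pageSize    = 4096
	maxAddr     = uint64(1) << 48               // maxAddr64
	maxMmapRand = (uint64(1) << 33) * pageSize  // maxMmapRand64
	minGap      = (128 << 20) + (0x3ffff << 12) // minGap64
)

// mmapRand mirrors the page-aligned randomization used above.
func mmapRand(max uint64) uint64 {
	return uint64(rand.Int63n(int64(max))) &^ (pageSize - 1)
}

func main() {
	gap := uint64(8 << 20) // RLIMIT_STACK, e.g. 8 MB
	if gap < minGap {
		gap = minGap
	}
	if maxGap := maxAddr / 6 * 5; gap > maxGap {
		gap = maxGap
	}
	rnd := mmapRand(maxMmapRand)
	fmt.Printf("TopDownBase=%#x BottomUpBase=%#x\n",
		(maxAddr-gap-rnd)&^(pageSize-1), (maxAddr/3+rnd)&^(pageSize-1))
}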
diff --git a/pkg/sentry/arch/arch_state_aarch64.go b/pkg/sentry/arch/arch_state_aarch64.go
new file mode 100644
index 000000000..0136a85ad
--- /dev/null
+++ b/pkg/sentry/arch/arch_state_aarch64.go
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package arch
+
+import (
+ "syscall"
+)
+
+type syscallPtraceRegs struct {
+ Regs [31]uint64
+ Sp uint64
+ Pc uint64
+ Pstate uint64
+}
+
+// saveRegs is invoked by stateify.
+func (s *State) saveRegs() syscallPtraceRegs {
+ return syscallPtraceRegs(s.Regs)
+}
+
+// loadRegs is invoked by stateify.
+func (s *State) loadRegs(r syscallPtraceRegs) {
+ s.Regs = syscall.PtraceRegs(r)
+}
diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go
index 9061fcc86..84f11b0d1 100644
--- a/pkg/sentry/arch/arch_state_x86.go
+++ b/pkg/sentry/arch/arch_state_x86.go
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// +build amd64 i386
+
package arch
import (
diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go
index 9294ac773..9f41e566f 100644
--- a/pkg/sentry/arch/arch_x86.go
+++ b/pkg/sentry/arch/arch_x86.go
@@ -19,7 +19,6 @@ package arch
import (
"fmt"
"io"
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/binary"
@@ -27,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/log"
rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/arch/registers.proto b/pkg/sentry/arch/registers.proto
index 9dc83e241..60c027aab 100644
--- a/pkg/sentry/arch/registers.proto
+++ b/pkg/sentry/arch/registers.proto
@@ -48,8 +48,45 @@ message AMD64Registers {
uint64 gs_base = 27;
}
+message ARM64Registers {
+ uint64 r0 = 1;
+ uint64 r1 = 2;
+ uint64 r2 = 3;
+ uint64 r3 = 4;
+ uint64 r4 = 5;
+ uint64 r5 = 6;
+ uint64 r6 = 7;
+ uint64 r7 = 8;
+ uint64 r8 = 9;
+ uint64 r9 = 10;
+ uint64 r10 = 11;
+ uint64 r11 = 12;
+ uint64 r12 = 13;
+ uint64 r13 = 14;
+ uint64 r14 = 15;
+ uint64 r15 = 16;
+ uint64 r16 = 17;
+ uint64 r17 = 18;
+ uint64 r18 = 19;
+ uint64 r19 = 20;
+ uint64 r20 = 21;
+ uint64 r21 = 22;
+ uint64 r22 = 23;
+ uint64 r23 = 24;
+ uint64 r24 = 25;
+ uint64 r25 = 26;
+ uint64 r26 = 27;
+ uint64 r27 = 28;
+ uint64 r28 = 29;
+ uint64 r29 = 30;
+ uint64 r30 = 31;
+ uint64 sp = 32;
+ uint64 pc = 33;
+ uint64 pstate = 34;
+}
message Registers {
oneof arch {
AMD64Registers amd64 = 1;
+ ARM64Registers arm64 = 2;
}
}
diff --git a/pkg/sentry/arch/signal.go b/pkg/sentry/arch/signal.go
new file mode 100644
index 000000000..402e46025
--- /dev/null
+++ b/pkg/sentry/arch/signal.go
@@ -0,0 +1,250 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// SignalAct represents the action that should be taken when a signal is
+// delivered, and is equivalent to struct sigaction.
+//
+// +stateify savable
+type SignalAct struct {
+ Handler uint64
+ Flags uint64
+ Restorer uint64 // Only used on amd64.
+ Mask linux.SignalSet
+}
+
+// SerializeFrom implements NativeSignalAct.SerializeFrom.
+func (s *SignalAct) SerializeFrom(other *SignalAct) {
+ *s = *other
+}
+
+// DeserializeTo implements NativeSignalAct.DeserializeTo.
+func (s *SignalAct) DeserializeTo(other *SignalAct) {
+ *other = *s
+}
+
+// SignalStack represents information about a user stack, and is equivalent to
+// stack_t.
+//
+// +stateify savable
+type SignalStack struct {
+ Addr uint64
+ Flags uint32
+ _ uint32
+ Size uint64
+}
+
+// SerializeFrom implements NativeSignalStack.SerializeFrom.
+func (s *SignalStack) SerializeFrom(other *SignalStack) {
+ *s = *other
+}
+
+// DeserializeTo implements NativeSignalStack.DeserializeTo.
+func (s *SignalStack) DeserializeTo(other *SignalStack) {
+ *other = *s
+}
+
+// SignalInfo represents information about a signal being delivered, and is
+// equivalent to struct siginfo in the Linux kernel (linux/include/uapi/asm-generic/siginfo.h).
+//
+// +stateify savable
+type SignalInfo struct {
+ Signo int32 // Signal number
+ Errno int32 // Errno value
+ Code int32 // Signal code
+ _ uint32
+
+ // struct siginfo::_sifields is a union. In SignalInfo, fields in the union
+ // are accessed through methods.
+ //
+ // For reference, here is the definition of _sifields: (_sigfault._trapno,
+ // which does not exist on x86, omitted for clarity)
+ //
+ // union {
+ // int _pad[SI_PAD_SIZE];
+ //
+ // /* kill() */
+ // struct {
+ // __kernel_pid_t _pid; /* sender's pid */
+ // __ARCH_SI_UID_T _uid; /* sender's uid */
+ // } _kill;
+ //
+ // /* POSIX.1b timers */
+ // struct {
+ // __kernel_timer_t _tid; /* timer id */
+ // int _overrun; /* overrun count */
+ // char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)];
+ // sigval_t _sigval; /* same as below */
+ // int _sys_private; /* not to be passed to user */
+ // } _timer;
+ //
+ // /* POSIX.1b signals */
+ // struct {
+ // __kernel_pid_t _pid; /* sender's pid */
+ // __ARCH_SI_UID_T _uid; /* sender's uid */
+ // sigval_t _sigval;
+ // } _rt;
+ //
+ // /* SIGCHLD */
+ // struct {
+ // __kernel_pid_t _pid; /* which child */
+ // __ARCH_SI_UID_T _uid; /* sender's uid */
+ // int _status; /* exit code */
+ // __ARCH_SI_CLOCK_T _utime;
+ // __ARCH_SI_CLOCK_T _stime;
+ // } _sigchld;
+ //
+ // /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
+ // struct {
+ // void *_addr; /* faulting insn/memory ref. */
+ // short _addr_lsb; /* LSB of the reported address */
+ // } _sigfault;
+ //
+ // /* SIGPOLL */
+ // struct {
+ // __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */
+ // int _fd;
+ // } _sigpoll;
+ //
+ // /* SIGSYS */
+ // struct {
+ // void *_call_addr; /* calling user insn */
+ // int _syscall; /* triggering system call number */
+ // unsigned int _arch; /* AUDIT_ARCH_* of syscall */
+ // } _sigsys;
+ // } _sifields;
+ //
+ // _sifields is padded so that the size of siginfo is SI_MAX_SIZE = 128
+ // bytes.
+ Fields [128 - 16]byte
+}
+
+// FixSignalCodeForUser fixes up si_code.
+//
+// The si_code we get from Linux may contain the kernel-specific code in the
+// top 16 bits if it's positive (e.g., from ptrace). Linux's
+// copy_siginfo_to_user does
+// err |= __put_user((short)from->si_code, &to->si_code);
+// to mask out those bits and we need to do the same.
+func (s *SignalInfo) FixSignalCodeForUser() {
+ if s.Code > 0 {
+ s.Code &= 0x0000ffff
+ }
+}
+
+// Pid returns the si_pid field.
+func (s *SignalInfo) Pid() int32 {
+ return int32(usermem.ByteOrder.Uint32(s.Fields[0:4]))
+}
+
+// SetPid mutates the si_pid field.
+func (s *SignalInfo) SetPid(val int32) {
+ usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val))
+}
+
+// Uid returns the si_uid field.
+func (s *SignalInfo) Uid() int32 {
+ return int32(usermem.ByteOrder.Uint32(s.Fields[4:8]))
+}
+
+// SetUid mutates the si_uid field.
+func (s *SignalInfo) SetUid(val int32) {
+ usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val))
+}
+
+// Sigval returns the sigval field, which is aliased to both si_int and si_ptr.
+func (s *SignalInfo) Sigval() uint64 {
+ return usermem.ByteOrder.Uint64(s.Fields[8:16])
+}
+
+// SetSigval mutates the sigval field.
+func (s *SignalInfo) SetSigval(val uint64) {
+ usermem.ByteOrder.PutUint64(s.Fields[8:16], val)
+}
+
+// TimerID returns the si_timerid field.
+func (s *SignalInfo) TimerID() linux.TimerID {
+ return linux.TimerID(usermem.ByteOrder.Uint32(s.Fields[0:4]))
+}
+
+// SetTimerID sets the si_timerid field.
+func (s *SignalInfo) SetTimerID(val linux.TimerID) {
+ usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val))
+}
+
+// Overrun returns the si_overrun field.
+func (s *SignalInfo) Overrun() int32 {
+ return int32(usermem.ByteOrder.Uint32(s.Fields[4:8]))
+}
+
+// SetOverrun sets the si_overrun field.
+func (s *SignalInfo) SetOverrun(val int32) {
+ usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val))
+}
+
+// Addr returns the si_addr field.
+func (s *SignalInfo) Addr() uint64 {
+ return usermem.ByteOrder.Uint64(s.Fields[0:8])
+}
+
+// SetAddr sets the si_addr field.
+func (s *SignalInfo) SetAddr(val uint64) {
+ usermem.ByteOrder.PutUint64(s.Fields[0:8], val)
+}
+
+// Status returns the si_status field.
+func (s *SignalInfo) Status() int32 {
+ return int32(usermem.ByteOrder.Uint32(s.Fields[8:12]))
+}
+
+// SetStatus mutates the si_status field.
+func (s *SignalInfo) SetStatus(val int32) {
+ usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val))
+}
+
+// CallAddr returns the si_call_addr field.
+func (s *SignalInfo) CallAddr() uint64 {
+ return usermem.ByteOrder.Uint64(s.Fields[0:8])
+}
+
+// SetCallAddr mutates the si_call_addr field.
+func (s *SignalInfo) SetCallAddr(val uint64) {
+ usermem.ByteOrder.PutUint64(s.Fields[0:8], val)
+}
+
+// Syscall returns the si_syscall field.
+func (s *SignalInfo) Syscall() int32 {
+ return int32(usermem.ByteOrder.Uint32(s.Fields[8:12]))
+}
+
+// SetSyscall mutates the si_syscall field.
+func (s *SignalInfo) SetSyscall(val int32) {
+ usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val))
+}
+
+// Arch returns the si_arch field.
+func (s *SignalInfo) Arch() uint32 {
+ return usermem.ByteOrder.Uint32(s.Fields[12:16])
+}
+
+// SetArch mutates the si_arch field.
+func (s *SignalInfo) SetArch(val uint32) {
+ usermem.ByteOrder.PutUint32(s.Fields[12:16], val)
+}
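Aside: the accessors above read and write members of the siginfo union at fixed offsets inside the raw Fields buffer. A standalone sketch of that pattern with encoding/binary in place of usermem.ByteOrder (helper names below are illustrative):

package main

import (
	"encoding/binary"
	"fmt"
)

// fields stands in for SignalInfo.Fields: the _sifields union as raw bytes.
type fields [128 - 16]byte

// si_pid and si_uid live at offsets 0 and 4 for kill-style signals.
func setPid(f *fields, pid int32) { binary.LittleEndian.PutUint32(f[0:4], uint32(pid)) }
func pid(f *fields) int32         { return int32(binary.LittleEndian.Uint32(f[0:4])) }
func setUid(f *fields, uid int32) { binary.LittleEndian.PutUint32(f[4:8], uint32(uid)) }
func uid(f *fields) int32         { return int32(binary.LittleEndian.Uint32(f[4:8])) }

func main() {
	var f fields
	setPid(&f, 1234)
	setUid(&f, 1000)
	fmt.Println(pid(&f), uid(&f)) // 1234 1000
}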
diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go
index febd6f9b9..1e4f9c3c2 100644
--- a/pkg/sentry/arch/signal_amd64.go
+++ b/pkg/sentry/arch/signal_amd64.go
@@ -26,236 +26,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/usermem"
)
-// SignalAct represents the action that should be taken when a signal is
-// delivered, and is equivalent to struct sigaction on 64-bit x86.
-//
-// +stateify savable
-type SignalAct struct {
- Handler uint64
- Flags uint64
- Restorer uint64
- Mask linux.SignalSet
-}
-
-// SerializeFrom implements NativeSignalAct.SerializeFrom.
-func (s *SignalAct) SerializeFrom(other *SignalAct) {
- *s = *other
-}
-
-// DeserializeTo implements NativeSignalAct.DeserializeTo.
-func (s *SignalAct) DeserializeTo(other *SignalAct) {
- *other = *s
-}
-
-// SignalStack represents information about a user stack, and is equivalent to
-// stack_t on 64-bit x86.
-//
-// +stateify savable
-type SignalStack struct {
- Addr uint64
- Flags uint32
- _ uint32
- Size uint64
-}
-
-// SerializeFrom implements NativeSignalStack.SerializeFrom.
-func (s *SignalStack) SerializeFrom(other *SignalStack) {
- *s = *other
-}
-
-// DeserializeTo implements NativeSignalStack.DeserializeTo.
-func (s *SignalStack) DeserializeTo(other *SignalStack) {
- *other = *s
-}
-
-// SignalInfo represents information about a signal being delivered, and is
-// equivalent to struct siginfo on 64-bit x86.
-//
-// +stateify savable
-type SignalInfo struct {
- Signo int32 // Signal number
- Errno int32 // Errno value
- Code int32 // Signal code
- _ uint32
-
- // struct siginfo::_sifields is a union. In SignalInfo, fields in the union
- // are accessed through methods.
- //
- // For reference, here is the definition of _sifields: (_sigfault._trapno,
- // which does not exist on x86, omitted for clarity)
- //
- // union {
- // int _pad[SI_PAD_SIZE];
- //
- // /* kill() */
- // struct {
- // __kernel_pid_t _pid; /* sender's pid */
- // __ARCH_SI_UID_T _uid; /* sender's uid */
- // } _kill;
- //
- // /* POSIX.1b timers */
- // struct {
- // __kernel_timer_t _tid; /* timer id */
- // int _overrun; /* overrun count */
- // char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)];
- // sigval_t _sigval; /* same as below */
- // int _sys_private; /* not to be passed to user */
- // } _timer;
- //
- // /* POSIX.1b signals */
- // struct {
- // __kernel_pid_t _pid; /* sender's pid */
- // __ARCH_SI_UID_T _uid; /* sender's uid */
- // sigval_t _sigval;
- // } _rt;
- //
- // /* SIGCHLD */
- // struct {
- // __kernel_pid_t _pid; /* which child */
- // __ARCH_SI_UID_T _uid; /* sender's uid */
- // int _status; /* exit code */
- // __ARCH_SI_CLOCK_T _utime;
- // __ARCH_SI_CLOCK_T _stime;
- // } _sigchld;
- //
- // /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
- // struct {
- // void *_addr; /* faulting insn/memory ref. */
- // short _addr_lsb; /* LSB of the reported address */
- // } _sigfault;
- //
- // /* SIGPOLL */
- // struct {
- // __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */
- // int _fd;
- // } _sigpoll;
- //
- // /* SIGSYS */
- // struct {
- // void *_call_addr; /* calling user insn */
- // int _syscall; /* triggering system call number */
- // unsigned int _arch; /* AUDIT_ARCH_* of syscall */
- // } _sigsys;
- // } _sifields;
- //
- // _sifields is padded so that the size of siginfo is SI_MAX_SIZE = 128
- // bytes.
- Fields [128 - 16]byte
-}
-
-// FixSignalCodeForUser fixes up si_code.
-//
-// The si_code we get from Linux may contain the kernel-specific code in the
-// top 16 bits if it's positive (e.g., from ptrace). Linux's
-// copy_siginfo_to_user does
-// err |= __put_user((short)from->si_code, &to->si_code);
-// to mask out those bits and we need to do the same.
-func (s *SignalInfo) FixSignalCodeForUser() {
- if s.Code > 0 {
- s.Code &= 0x0000ffff
- }
-}
-
-// Pid returns the si_pid field.
-func (s *SignalInfo) Pid() int32 {
- return int32(usermem.ByteOrder.Uint32(s.Fields[0:4]))
-}
-
-// SetPid mutates the si_pid field.
-func (s *SignalInfo) SetPid(val int32) {
- usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val))
-}
-
-// Uid returns the si_uid field.
-func (s *SignalInfo) Uid() int32 {
- return int32(usermem.ByteOrder.Uint32(s.Fields[4:8]))
-}
-
-// SetUid mutates the si_uid field.
-func (s *SignalInfo) SetUid(val int32) {
- usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val))
-}
-
-// Sigval returns the sigval field, which is aliased to both si_int and si_ptr.
-func (s *SignalInfo) Sigval() uint64 {
- return usermem.ByteOrder.Uint64(s.Fields[8:16])
-}
-
-// SetSigval mutates the sigval field.
-func (s *SignalInfo) SetSigval(val uint64) {
- usermem.ByteOrder.PutUint64(s.Fields[8:16], val)
-}
-
-// TimerID returns the si_timerid field.
-func (s *SignalInfo) TimerID() linux.TimerID {
- return linux.TimerID(usermem.ByteOrder.Uint32(s.Fields[0:4]))
-}
-
-// SetTimerID sets the si_timerid field.
-func (s *SignalInfo) SetTimerID(val linux.TimerID) {
- usermem.ByteOrder.PutUint32(s.Fields[0:4], uint32(val))
-}
-
-// Overrun returns the si_overrun field.
-func (s *SignalInfo) Overrun() int32 {
- return int32(usermem.ByteOrder.Uint32(s.Fields[4:8]))
-}
-
-// SetOverrun sets the si_overrun field.
-func (s *SignalInfo) SetOverrun(val int32) {
- usermem.ByteOrder.PutUint32(s.Fields[4:8], uint32(val))
-}
-
-// Addr returns the si_addr field.
-func (s *SignalInfo) Addr() uint64 {
- return usermem.ByteOrder.Uint64(s.Fields[0:8])
-}
-
-// SetAddr sets the si_addr field.
-func (s *SignalInfo) SetAddr(val uint64) {
- usermem.ByteOrder.PutUint64(s.Fields[0:8], val)
-}
-
-// Status returns the si_status field.
-func (s *SignalInfo) Status() int32 {
- return int32(usermem.ByteOrder.Uint32(s.Fields[8:12]))
-}
-
-// SetStatus mutates the si_status field.
-func (s *SignalInfo) SetStatus(val int32) {
- usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val))
-}
-
-// CallAddr returns the si_call_addr field.
-func (s *SignalInfo) CallAddr() uint64 {
- return usermem.ByteOrder.Uint64(s.Fields[0:8])
-}
-
-// SetCallAddr mutates the si_call_addr field.
-func (s *SignalInfo) SetCallAddr(val uint64) {
- usermem.ByteOrder.PutUint64(s.Fields[0:8], val)
-}
-
-// Syscall returns the si_syscall field.
-func (s *SignalInfo) Syscall() int32 {
- return int32(usermem.ByteOrder.Uint32(s.Fields[8:12]))
-}
-
-// SetSyscall mutates the si_syscall field.
-func (s *SignalInfo) SetSyscall(val int32) {
- usermem.ByteOrder.PutUint32(s.Fields[8:12], uint32(val))
-}
-
-// Arch returns the si_arch field.
-func (s *SignalInfo) Arch() uint32 {
- return usermem.ByteOrder.Uint32(s.Fields[12:16])
-}
-
-// SetArch mutates the si_arch field.
-func (s *SignalInfo) SetArch(val uint32) {
- usermem.ByteOrder.PutUint32(s.Fields[12:16], val)
-}
-
// SignalContext64 is equivalent to struct sigcontext, the type passed as the
// second argument to signal handlers set by signal(2).
type SignalContext64 struct {
diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go
new file mode 100644
index 000000000..7d0e98935
--- /dev/null
+++ b/pkg/sentry/arch/signal_arm64.go
@@ -0,0 +1,126 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arch
+
+import (
+ "encoding/binary"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// SignalContext64 is equivalent to struct sigcontext, the type passed as the
+// second argument to signal handlers set by signal(2).
+type SignalContext64 struct {
+ FaultAddr uint64
+ Regs [31]uint64
+ Sp uint64
+ Pc uint64
+ Pstate uint64
+ _pad [8]byte // __attribute__((__aligned__(16)))
+ Reserved [4096]uint8
+}
+
+// UContext64 is equivalent to ucontext on arm64(arch/arm64/include/uapi/asm/ucontext.h).
+type UContext64 struct {
+ Flags uint64
+ Link *UContext64
+ Stack SignalStack
+ Sigset linux.SignalSet
+ // glibc uses a 1024-bit sigset_t
+ _pad [(1024 - 64) / 8]byte
+ // sigcontext must be 16-byte aligned
+ _pad2 [8]byte
+ // last for future expansion
+ MContext SignalContext64
+}
+
+// NewSignalAct implements Context.NewSignalAct.
+func (c *context64) NewSignalAct() NativeSignalAct {
+ return &SignalAct{}
+}
+
+// NewSignalStack implements Context.NewSignalStack.
+func (c *context64) NewSignalStack() NativeSignalStack {
+ return &SignalStack{}
+}
+
+// SignalSetup implements Context.SignalSetup.
+func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt *SignalStack, sigset linux.SignalSet) error {
+ sp := st.Bottom
+
+ if !(alt.IsEnabled() && sp == alt.Top()) {
+ sp -= 128
+ }
+
+ // Construct the UContext64 now since we need its size.
+ uc := &UContext64{
+ Flags: 0,
+ Stack: *alt,
+ MContext: SignalContext64{
+ Regs: c.Regs.Regs,
+ Sp: c.Regs.Sp,
+ Pc: c.Regs.Pc,
+ Pstate: c.Regs.Pstate,
+ },
+ Sigset: sigset,
+ }
+
+ ucSize := binary.Size(uc)
+ if ucSize < 0 {
+ panic("can't get size of UContext64")
+ }
+ // st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128.
+ frameSize := int(st.Arch.Width()) + ucSize + 128
+ frameBottom := (sp-usermem.Addr(frameSize)) & ^usermem.Addr(15) - 8
+ sp = frameBottom + usermem.Addr(frameSize)
+ st.Bottom = sp
+
+ // Prior to proceeding, figure out if the frame will exhaust the range
+ // for the signal stack. This is not allowed, and should immediately
+ // force signal delivery (reverting to the default handler).
+ if act.IsOnStack() && alt.IsEnabled() && !alt.Contains(frameBottom) {
+ return syscall.EFAULT
+ }
+
+ // Adjust the code.
+ info.FixSignalCodeForUser()
+
+ // Set up the stack frame.
+ infoAddr, err := st.Push(info)
+ if err != nil {
+ return err
+ }
+ ucAddr, err := st.Push(uc)
+ if err != nil {
+ return err
+ }
+
+ // Set up registers.
+ c.Regs.Sp = uint64(st.Bottom)
+ c.Regs.Pc = act.Handler
+ c.Regs.Regs[0] = uint64(info.Signo)
+ c.Regs.Regs[1] = uint64(infoAddr)
+ c.Regs.Regs[2] = uint64(ucAddr)
+
+ return nil
+}
+
+// SignalRestore implements Context.SignalRestore.
+// Only used on intel.
+func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalStack, error) {
+ return 0, SignalStack{}, nil
+}
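Aside: SignalSetup above reserves space for the restorer slot, the UContext64, and the 128-byte siginfo, then rounds the frame bottom down for 16-byte alignment before pushing siginfo and ucontext onto the user stack. A standalone sketch of the stack-pointer arithmetic (the ucontext size is an illustrative stand-in for binary.Size(uc)):

package main

import "fmt"

func main() {
	sp := uint64(0x7ffffff000) // user stack pointer at signal delivery
	const (
		restorerSize = 8   // st.Arch.Width()
		sigInfoSize  = 128 // sizeof(siginfo)
	)
	ucSize := 4464 // illustrative; the patch computes binary.Size(uc)

	frameSize := uint64(restorerSize + ucSize + sigInfoSize)
	// Align the frame bottom down to 16 bytes minus 8, as in SignalSetup.
	frameBottom := (sp-frameSize)&^15 - 8
	newSP := frameBottom + frameSize
	fmt.Printf("frameBottom=%#x newSP=%#x\n", frameBottom, newSP)
}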
diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go
index 5a3228113..d324da705 100644
--- a/pkg/sentry/arch/signal_stack.go
+++ b/pkg/sentry/arch/signal_stack.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// +build i386 amd64
+// +build i386 amd64 arm64
package arch
diff --git a/pkg/sentry/arch/syscalls_arm64.go b/pkg/sentry/arch/syscalls_arm64.go
new file mode 100644
index 000000000..00d5ef461
--- /dev/null
+++ b/pkg/sentry/arch/syscalls_arm64.go
@@ -0,0 +1,62 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package arch
+
+const restartSyscallNr = uintptr(128)
+
+// SyscallNo returns the syscall number according to the 64-bit convention.
+func (c *context64) SyscallNo() uintptr {
+ return uintptr(c.Regs.Regs[8])
+}
+
+// SyscallArgs provides syscall arguments according to the 64-bit convention.
+//
+// Due to the way addresses are mapped for the sentry, this binary *must* be
+// built in 64-bit mode. So we can just assume the syscall numbers that come
+// back match the expected host system call numbers.
+// General purpose registers usage on Arm64:
+// R0...R7: parameter/result registers.
+// R8: indirect result location register.
+// R9...R15: temporary registers.
+// R16: the first intra-procedure-call scratch register.
+// R17: the second intra-procedure-call scratch register.
+// R18: the platform register.
+// R19...R28: callee-saved registers.
+// R29: the frame pointer.
+// R30: the link register.
+func (c *context64) SyscallArgs() SyscallArguments {
+ return SyscallArguments{
+ SyscallArgument{Value: uintptr(c.Regs.Regs[0])},
+ SyscallArgument{Value: uintptr(c.Regs.Regs[1])},
+ SyscallArgument{Value: uintptr(c.Regs.Regs[2])},
+ SyscallArgument{Value: uintptr(c.Regs.Regs[3])},
+ SyscallArgument{Value: uintptr(c.Regs.Regs[4])},
+ SyscallArgument{Value: uintptr(c.Regs.Regs[5])},
+ }
+}
+
+// RestartSyscall implements Context.RestartSyscall.
+func (c *context64) RestartSyscall() {
+ c.Regs.Pc -= SyscallWidth
+ c.Regs.Regs[8] = uint64(restartSyscallNr)
+}
+
+// RestartSyscallWithRestartBlock implements Context.RestartSyscallWithRestartBlock.
+func (c *context64) RestartSyscallWithRestartBlock() {
+ c.Regs.Pc -= SyscallWidth
+ c.Regs.Regs[8] = uint64(restartSyscallNr)
+}
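Aside: both restart paths above rewind the PC by one 4-byte instruction so the trapping svc is re-executed, and reload x8 (the arm64 syscall-number register) with the restart number. A tiny sketch on a mock register file:

package main

import "fmt"

const (
	syscallWidth     = 4   // SyscallWidth
	restartSyscallNr = 128 // restart_syscall on arm64, as in the patch
)

type regs struct {
	Regs [31]uint64
	Pc   uint64
}

// restart mirrors RestartSyscall above.
func restart(r *regs) {
	r.Pc -= syscallWidth         // re-execute the trapping svc instruction
	r.Regs[8] = restartSyscallNr // x8 carries the syscall number on arm64
}

func main() {
	r := &regs{Pc: 0x400004}
	restart(r)
	fmt.Printf("pc=%#x x8=%d\n", r.Pc, r.Regs[8])
}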
diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD
index 5522cecd0..2561a6109 100644
--- a/pkg/sentry/control/BUILD
+++ b/pkg/sentry/control/BUILD
@@ -30,6 +30,7 @@ go_library(
"//pkg/sentry/strace",
"//pkg/sentry/usage",
"//pkg/sentry/watchdog",
+ "//pkg/sync",
"//pkg/tcpip/link/sniffer",
"//pkg/urpc",
],
diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go
index 1f78d54a2..151808911 100644
--- a/pkg/sentry/control/pprof.go
+++ b/pkg/sentry/control/pprof.go
@@ -19,9 +19,10 @@ import (
"runtime"
"runtime/pprof"
"runtime/trace"
- "sync"
"gvisor.dev/gvisor/pkg/fd"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/urpc"
)
@@ -56,6 +57,9 @@ type Profile struct {
// traceFile is the current execution trace output file.
traceFile *fd.FD
+
+ // Kernel is the kernel under profile.
+ Kernel *kernel.Kernel
}
// StartCPUProfile is an RPC stub which starts recording the CPU profile in a
@@ -147,6 +151,9 @@ func (p *Profile) StartTrace(o *ProfileOpts, _ *struct{}) error {
return err
}
+ // Ensure all trace contexts are registered.
+ p.Kernel.RebuildTraceContexts()
+
p.traceFile = output
return nil
}
@@ -158,9 +165,15 @@ func (p *Profile) StopTrace(_, _ *struct{}) error {
defer p.mu.Unlock()
if p.traceFile == nil {
- return errors.New("Execution tracing not start")
+ return errors.New("Execution tracing not started")
}
+ // Similarly to the case above, if tasks have not ended traces, we will
+ // lose information. Thus we need to rebuild the tasks in order to have
+ // complete information. This will not lose information if multiple
+ // traces are overlapping.
+ p.Kernel.RebuildTraceContexts()
+
trace.Stop()
p.traceFile.Close()
p.traceFile = nil
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index c35faeb4c..ced51c66c 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -268,14 +268,17 @@ func (proc *Proc) Ps(args *PsArgs, out *string) error {
}
// Process contains information about a single process in a Sandbox.
-// TODO(b/117881927): Implement TTY field.
type Process struct {
UID auth.KUID `json:"uid"`
PID kernel.ThreadID `json:"pid"`
// Parent PID
- PPID kernel.ThreadID `json:"ppid"`
+ PPID kernel.ThreadID `json:"ppid"`
+ Threads []kernel.ThreadID `json:"threads"`
// Processor utilization
C int32 `json:"c"`
+ // TTY name of the process. Will be of the form "pts/N" if there is a
+ // TTY, or "?" if there is not.
+ TTY string `json:"tty"`
// Start time
STime string `json:"stime"`
// CPU time
@@ -285,18 +288,19 @@ type Process struct {
}
// ProcessListToTable prints a table with the following format:
-// UID PID PPID C STIME TIME CMD
-// 0 1 0 0 14:04 505262ns tail
+// UID PID PPID C TTY STIME TIME CMD
+// 0 1 0 0 pts/4 14:04 505262ns tail
func ProcessListToTable(pl []*Process) string {
var buf bytes.Buffer
tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0)
- fmt.Fprint(tw, "UID\tPID\tPPID\tC\tSTIME\tTIME\tCMD")
+ fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD")
for _, d := range pl {
- fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s",
+ fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s",
d.UID,
d.PID,
d.PPID,
d.C,
+ d.TTY,
d.STime,
d.Time,
d.Cmd)
@@ -307,7 +311,7 @@ func ProcessListToTable(pl []*Process) string {
// ProcessListToJSON will return the JSON representation of ps.
func ProcessListToJSON(pl []*Process) (string, error) {
- b, err := json.Marshal(pl)
+ b, err := json.MarshalIndent(pl, "", " ")
if err != nil {
return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err)
}
@@ -334,7 +338,9 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error {
ts := k.TaskSet()
now := k.RealtimeClock().Now()
for _, tg := range ts.Root.ThreadGroups() {
- pid := tg.PIDNamespace().IDOfThreadGroup(tg)
+ pidns := tg.PIDNamespace()
+ pid := pidns.IDOfThreadGroup(tg)
+
// If tg has already been reaped ignore it.
if pid == 0 {
continue
@@ -345,16 +351,19 @@ func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error {
ppid := kernel.ThreadID(0)
if p := tg.Leader().Parent(); p != nil {
- ppid = p.PIDNamespace().IDOfThreadGroup(p.ThreadGroup())
+ ppid = pidns.IDOfThreadGroup(p.ThreadGroup())
}
+ threads := tg.MemberIDs(pidns)
*out = append(*out, &Process{
- UID: tg.Leader().Credentials().EffectiveKUID,
- PID: pid,
- PPID: ppid,
- STime: formatStartTime(now, tg.Leader().StartTime()),
- C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now),
- Time: tg.CPUStats().SysTime.String(),
- Cmd: tg.Leader().Name(),
+ UID: tg.Leader().Credentials().EffectiveKUID,
+ PID: pid,
+ PPID: ppid,
+ Threads: threads,
+ STime: formatStartTime(now, tg.Leader().StartTime()),
+ C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now),
+ Time: tg.CPUStats().SysTime.String(),
+ Cmd: tg.Leader().Name(),
+ TTY: ttyName(tg.TTY()),
})
}
sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID })
@@ -395,3 +404,10 @@ func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 {
}
return int32(percentCPU)
}
+
+func ttyName(tty *kernel.TTY) string {
+ if tty == nil {
+ return "?"
+ }
+ return fmt.Sprintf("pts/%d", tty.Index)
+}
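Aside: the switch from json.Marshal to json.MarshalIndent only changes formatting; the same fields (now including threads and tty) are emitted, just pretty-printed with two-space indentation. A quick standalone illustration with a reduced stand-in struct:

package main

import (
	"encoding/json"
	"fmt"
)

type proc struct {
	PID     int32   `json:"pid"`
	PPID    int32   `json:"ppid"`
	Threads []int32 `json:"threads"`
	TTY     string  `json:"tty"`
}

func main() {
	pl := []proc{{PID: 1, PPID: 0, Threads: []int32{1, 2}, TTY: "pts/4"}}
	b, err := json.MarshalIndent(pl, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b))
}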
diff --git a/pkg/sentry/control/proc_test.go b/pkg/sentry/control/proc_test.go
index d8ada2694..0a88459b2 100644
--- a/pkg/sentry/control/proc_test.go
+++ b/pkg/sentry/control/proc_test.go
@@ -34,7 +34,7 @@ func TestProcessListTable(t *testing.T) {
}{
{
pl: []*Process{},
- expected: "UID PID PPID C STIME TIME CMD",
+ expected: "UID PID PPID C TTY STIME TIME CMD",
},
{
pl: []*Process{
@@ -43,6 +43,7 @@ func TestProcessListTable(t *testing.T) {
PID: 0,
PPID: 0,
C: 0,
+ TTY: "?",
STime: "0",
Time: "0",
Cmd: "zero",
@@ -52,14 +53,15 @@ func TestProcessListTable(t *testing.T) {
PID: 1,
PPID: 1,
C: 1,
+ TTY: "pts/4",
STime: "1",
Time: "1",
Cmd: "one",
},
},
- expected: `UID PID PPID C STIME TIME CMD
-0 0 0 0 0 0 zero
-1 1 1 1 1 1 one`,
+ expected: `UID PID PPID C TTY STIME TIME CMD
+0 0 0 0 ? 0 0 zero
+1 1 1 1 pts/4 1 1 one`,
},
}
diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD
index 1098ed777..97fa1512c 100644
--- a/pkg/sentry/device/BUILD
+++ b/pkg/sentry/device/BUILD
@@ -8,7 +8,10 @@ go_library(
srcs = ["device.go"],
importpath = "gvisor.dev/gvisor/pkg/sentry/device",
visibility = ["//pkg/sentry:internal"],
- deps = ["//pkg/abi/linux"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sync",
+ ],
)
go_test(
diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go
index 47945d1a7..69e71e322 100644
--- a/pkg/sentry/device/device.go
+++ b/pkg/sentry/device/device.go
@@ -19,10 +19,10 @@ package device
import (
"bytes"
"fmt"
- "sync"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sync"
)
// Registry tracks all simple devices and related state on the system for
diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD
index c035ffff7..7d5d72d5a 100644
--- a/pkg/sentry/fs/BUILD
+++ b/pkg/sentry/fs/BUILD
@@ -68,7 +68,7 @@ go_library(
"//pkg/sentry/usage",
"//pkg/sentry/usermem",
"//pkg/state",
- "//pkg/syncutil",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
@@ -115,6 +115,7 @@ go_test(
"//pkg/sentry/fs/tmpfs",
"//pkg/sentry/kernel/contexttest",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
],
)
diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go
index 9ac62c84d..734177e90 100644
--- a/pkg/sentry/fs/copy_up.go
+++ b/pkg/sentry/fs/copy_up.go
@@ -17,12 +17,12 @@ package fs
import (
"fmt"
"io"
- "sync"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go
index 1d80bf15a..738580c5f 100644
--- a/pkg/sentry/fs/copy_up_test.go
+++ b/pkg/sentry/fs/copy_up_test.go
@@ -19,13 +19,13 @@ import (
"crypto/rand"
"fmt"
"io"
- "sync"
"testing"
"gvisor.dev/gvisor/pkg/sentry/fs"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
)
const (
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index 3cb73bd78..31fc4d87b 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -18,7 +18,6 @@ import (
"fmt"
"path"
"sort"
- "sync"
"sync/atomic"
"syscall"
@@ -28,6 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go
index 60a15a275..25514ace4 100644
--- a/pkg/sentry/fs/dirent_cache.go
+++ b/pkg/sentry/fs/dirent_cache.go
@@ -16,7 +16,8 @@ package fs
import (
"fmt"
- "sync"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
// DirentCache is an LRU cache of Dirents. The Dirent's refCount is
diff --git a/pkg/sentry/fs/dirent_cache_limiter.go b/pkg/sentry/fs/dirent_cache_limiter.go
index ebb80bd50..525ee25f9 100644
--- a/pkg/sentry/fs/dirent_cache_limiter.go
+++ b/pkg/sentry/fs/dirent_cache_limiter.go
@@ -16,7 +16,8 @@ package fs
import (
"fmt"
- "sync"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
// DirentCacheLimiter acts as a global limit for all dirent caches in the
diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD
index 277ee4c31..cc43de69d 100644
--- a/pkg/sentry/fs/fdpipe/BUILD
+++ b/pkg/sentry/fs/fdpipe/BUILD
@@ -23,6 +23,7 @@ go_library(
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/safemem",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go
index 669ffcb75..5b6cfeb0a 100644
--- a/pkg/sentry/fs/fdpipe/pipe.go
+++ b/pkg/sentry/fs/fdpipe/pipe.go
@@ -17,7 +17,6 @@ package fdpipe
import (
"os"
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/fd"
@@ -29,6 +28,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go
index 29175fb3d..cee87f726 100644
--- a/pkg/sentry/fs/fdpipe/pipe_state.go
+++ b/pkg/sentry/fs/fdpipe/pipe_state.go
@@ -17,10 +17,10 @@ package fdpipe
import (
"fmt"
"io/ioutil"
- "sync"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sync"
)
// beforeSave is invoked by stateify.
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index c0a6e884b..7c4586296 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -16,7 +16,6 @@ package fs
import (
"math"
- "sync"
"sync/atomic"
"time"
@@ -29,6 +28,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -555,6 +555,10 @@ type lockedWriter struct {
//
// This applies only to Write, not WriteAt.
Offset int64
+
+ // Err contains the first error encountered while copying. This is
+ // useful to determine whether Writer or Reader failed during io.Copy.
+ Err error
}
// Write implements io.Writer.Write.
@@ -590,5 +594,8 @@ func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) {
break
}
}
+ if w.Err == nil {
+ w.Err = err
+ }
return written, err
}
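Aside: io.Copy returns a single error, so recording the first write error in lockedWriter.Err lets callers tell whether the reader or the writer side failed. A standalone sketch of the same pattern with an ordinary io.Writer wrapper (names below are illustrative):

package main

import (
	"errors"
	"fmt"
	"io"
	"strings"
)

// errWriter records the first error it returns, like lockedWriter.Err above.
type errWriter struct {
	w   io.Writer
	err error
}

func (e *errWriter) Write(p []byte) (int, error) {
	n, err := e.w.Write(p)
	if err != nil && e.err == nil {
		e.err = err
	}
	return n, err
}

// failAfter fails once more than limit bytes have been written.
type failAfter struct{ n, limit int }

func (f *failAfter) Write(p []byte) (int, error) {
	f.n += len(p)
	if f.n > f.limit {
		return 0, errors.New("disk full")
	}
	return len(p), nil
}

func main() {
	dst := &errWriter{w: &failAfter{limit: 4}}
	_, err := io.Copy(dst, strings.NewReader("hello world"))
	// dst.err being set means the writer, not the reader, caused the failure.
	fmt.Println(err != nil, dst.err != nil) // true true
}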
diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go
index 225e40186..8a633b1ba 100644
--- a/pkg/sentry/fs/file_overlay.go
+++ b/pkg/sentry/fs/file_overlay.go
@@ -16,13 +16,13 @@ package fs
import (
"io"
- "sync"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go
index b157fd228..c5b51620a 100644
--- a/pkg/sentry/fs/filesystems.go
+++ b/pkg/sentry/fs/filesystems.go
@@ -18,9 +18,9 @@ import (
"fmt"
"sort"
"strings"
- "sync"
"gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sync"
)
// FilesystemFlags matches include/linux/fs.h:file_system_type.fs_flags.
diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go
index 8b2a5e6b2..26abf49e2 100644
--- a/pkg/sentry/fs/fs.go
+++ b/pkg/sentry/fs/fs.go
@@ -54,10 +54,9 @@
package fs
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sync"
)
var (
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index b2e8d9c77..945b6270d 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -53,7 +53,7 @@ go_template_instance(
"Key": "uint64",
"Range": "memmap.MappableRange",
"Value": "uint64",
- "Functions": "fileRangeSetFunctions",
+ "Functions": "FileRangeSetFunctions",
},
)
@@ -93,6 +93,7 @@ go_library(
"//pkg/sentry/usage",
"//pkg/sentry/usermem",
"//pkg/state",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
index 0a5466b0a..f52d712e3 100644
--- a/pkg/sentry/fs/fsutil/file_range_set.go
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -34,25 +34,25 @@ import (
//
// type FileRangeSet <generated by go_generics>
-// fileRangeSetFunctions implements segment.Functions for FileRangeSet.
-type fileRangeSetFunctions struct{}
+// FileRangeSetFunctions implements segment.Functions for FileRangeSet.
+type FileRangeSetFunctions struct{}
// MinKey implements segment.Functions.MinKey.
-func (fileRangeSetFunctions) MinKey() uint64 {
+func (FileRangeSetFunctions) MinKey() uint64 {
return 0
}
// MaxKey implements segment.Functions.MaxKey.
-func (fileRangeSetFunctions) MaxKey() uint64 {
+func (FileRangeSetFunctions) MaxKey() uint64 {
return math.MaxUint64
}
// ClearValue implements segment.Functions.ClearValue.
-func (fileRangeSetFunctions) ClearValue(_ *uint64) {
+func (FileRangeSetFunctions) ClearValue(_ *uint64) {
}
// Merge implements segment.Functions.Merge.
-func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) {
+func (FileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) {
if frstart1+mr1.Length() != frstart2 {
return 0, false
}
@@ -60,7 +60,7 @@ func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _
}
// Split implements segment.Functions.Split.
-func (fileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) {
+func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) {
return frstart, frstart + (split - mr.Start)
}
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
index b06a71cc2..837fc70b5 100644
--- a/pkg/sentry/fs/fsutil/host_file_mapper.go
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -16,7 +16,6 @@ package fsutil
import (
"fmt"
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/log"
@@ -24,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
)
// HostFileMapper caches mappings of an arbitrary host file descriptor. It is
diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go
index 30475f340..a625f0e26 100644
--- a/pkg/sentry/fs/fsutil/host_mappable.go
+++ b/pkg/sentry/fs/fsutil/host_mappable.go
@@ -16,7 +16,6 @@ package fsutil
import (
"math"
- "sync"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -24,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
)
// HostMappable implements memmap.Mappable and platform.File over a
diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
index 4e100a402..adf5ec69c 100644
--- a/pkg/sentry/fs/fsutil/inode.go
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -15,13 +15,12 @@
package fsutil
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index 798920d18..20a014402 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -17,7 +17,6 @@ package fsutil
import (
"fmt"
"io"
- "sync"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/context"
@@ -30,6 +29,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
)
// Lock order (compare the lock order model in mm/mm.go):
diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD
index 4a005c605..fd870e8e1 100644
--- a/pkg/sentry/fs/gofer/BUILD
+++ b/pkg/sentry/fs/gofer/BUILD
@@ -44,6 +44,7 @@ go_library(
"//pkg/sentry/safemem",
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserr",
"//pkg/syserror",
"//pkg/unet",
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go
index 91263ebdc..245fe2ef1 100644
--- a/pkg/sentry/fs/gofer/inode.go
+++ b/pkg/sentry/fs/gofer/inode.go
@@ -16,7 +16,6 @@ package gofer
import (
"errors"
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -31,6 +30,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/host"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/safemem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index 8c17603f8..c09f3b71c 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -234,6 +234,8 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
if err != nil {
return nil, err
}
+ // We're not going to use newFile after return.
+ defer newFile.close(ctx)
// Stabilize the endpoint map while creation is in progress.
unlock := i.session().endpoints.lock()
@@ -254,7 +256,6 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
// Get the attributes of the file to create inode key.
qid, mask, attr, err := getattr(ctx, newFile)
if err != nil {
- newFile.close(ctx)
return nil, err
}
@@ -270,7 +271,6 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
// cloned and re-opened multiple times after creation.
_, unopened, err := i.fileState.file.walk(ctx, []string{name})
if err != nil {
- newFile.close(ctx)
return nil, err
}
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index 0da608548..edc796ce0 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -16,7 +16,6 @@ package gofer
import (
"fmt"
- "sync"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/refs"
@@ -25,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
)
@@ -143,9 +143,9 @@ type session struct {
// socket files. This allows unix domain sockets to be used with paths that
// belong to a gofer.
//
- // TODO(b/77154739): there are few possible races with someone stat'ing the
- // file and another deleting it concurrently, where the file will not be
- // reported as socket file.
+	// TODO(gvisor.dev/issue/1200): there are a few possible races with someone
+	// stat'ing the file and another deleting it concurrently, where the file
+	// will not be reported as a socket file.
endpoints *endpointMaps `state:"wait"`
}
diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD
index 23daeb528..2b581aa69 100644
--- a/pkg/sentry/fs/host/BUILD
+++ b/pkg/sentry/fs/host/BUILD
@@ -50,6 +50,7 @@ go_library(
"//pkg/sentry/unimpl",
"//pkg/sentry/uniqueid",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserr",
"//pkg/syserror",
"//pkg/tcpip",
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index a6e4a09e3..873a1c52d 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -15,7 +15,6 @@
package host
import (
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -28,6 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index 107336a3e..c076d5bdd 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -16,7 +16,6 @@ package host
import (
"fmt"
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -30,6 +29,7 @@ import (
unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/tcpip"
diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go
index 90331e3b2..753ef8cd6 100644
--- a/pkg/sentry/fs/host/tty.go
+++ b/pkg/sentry/fs/host/tty.go
@@ -15,8 +15,6 @@
package host
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/context"
@@ -24,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index 2d43dff1d..468043df0 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -15,8 +15,6 @@
package fs
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/metric"
@@ -26,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -270,6 +269,14 @@ func (i *Inode) Getxattr(name string) (string, error) {
return i.InodeOperations.Getxattr(i, name)
}
+// Setxattr calls i.InodeOperations.Setxattr with i as the Inode.
+func (i *Inode) Setxattr(name, value string) error {
+ if i.overlay != nil {
+ return overlaySetxattr(i.overlay, name, value)
+ }
+ return i.InodeOperations.Setxattr(i, name, value)
+}
+
// Listxattr calls i.InodeOperations.Listxattr with i as the Inode.
func (i *Inode) Listxattr() (map[string]struct{}, error) {
if i.overlay != nil {
diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go
index 0f2a66a79..efd3c962b 100644
--- a/pkg/sentry/fs/inode_inotify.go
+++ b/pkg/sentry/fs/inode_inotify.go
@@ -16,7 +16,8 @@ package fs
import (
"fmt"
- "sync"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
// Watches is the collection of inotify watches on an inode.
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index a09147080..13d11e001 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -436,7 +436,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena
}
func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) {
- if err := copyUp(ctx, parent); err != nil {
+ if err := copyUpLockedForRename(ctx, parent); err != nil {
return nil, err
}
@@ -552,6 +552,11 @@ func overlayGetxattr(o *overlayEntry, name string) (string, error) {
return s, err
}
+// TODO(b/146028302): Support setxattr for overlayfs.
+func overlaySetxattr(o *overlayEntry, name, value string) error {
+ return syserror.EOPNOTSUPP
+}
+
func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) {
o.copyMu.RLock()
defer o.copyMu.RUnlock()
diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go
index ba3e0233d..cc7dd1c92 100644
--- a/pkg/sentry/fs/inotify.go
+++ b/pkg/sentry/fs/inotify.go
@@ -16,7 +16,6 @@ package fs
import (
"io"
- "sync"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -25,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go
index 0aa0a5e9b..900cba3ca 100644
--- a/pkg/sentry/fs/inotify_watch.go
+++ b/pkg/sentry/fs/inotify_watch.go
@@ -15,10 +15,10 @@
package fs
import (
- "sync"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sync"
)
// Watch represent a particular inotify watch created by inotify_add_watch.
diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD
index 8d62642e7..2c332a82a 100644
--- a/pkg/sentry/fs/lock/BUILD
+++ b/pkg/sentry/fs/lock/BUILD
@@ -44,6 +44,7 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/log",
+ "//pkg/sync",
"//pkg/waiter",
],
)
diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go
index 636484424..41b040818 100644
--- a/pkg/sentry/fs/lock/lock.go
+++ b/pkg/sentry/fs/lock/lock.go
@@ -52,9 +52,9 @@ package lock
import (
"fmt"
"math"
- "sync"
"syscall"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index ac0398bd9..db3dfd096 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -19,7 +19,6 @@ import (
"math"
"path"
"strings"
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -27,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go
index 25573e986..4cad55327 100644
--- a/pkg/sentry/fs/overlay.go
+++ b/pkg/sentry/fs/overlay.go
@@ -17,13 +17,12 @@ package fs
import (
"fmt"
"strings"
- "sync"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/usermem"
- "gvisor.dev/gvisor/pkg/syncutil"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -199,7 +198,7 @@ type overlayEntry struct {
upper *Inode
// dirCacheMu protects dirCache.
- dirCacheMu syncutil.DowngradableRWMutex `state:"nosave"`
+ dirCacheMu sync.DowngradableRWMutex `state:"nosave"`
// dirCache is cache of DentAttrs from upper and lower Inodes.
dirCache *SortedDentryMap
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index 75cbb0622..94d46ab1b 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -51,6 +51,7 @@ go_library(
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usage",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/tcpip/header",
"//pkg/waiter",
diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD
index fe7067be1..38b246dff 100644
--- a/pkg/sentry/fs/proc/seqfile/BUILD
+++ b/pkg/sentry/fs/proc/seqfile/BUILD
@@ -16,6 +16,7 @@ go_library(
"//pkg/sentry/fs/proc/device",
"//pkg/sentry/kernel/time",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go
index 5fe823000..f9af191d5 100644
--- a/pkg/sentry/fs/proc/seqfile/seqfile.go
+++ b/pkg/sentry/fs/proc/seqfile/seqfile.go
@@ -17,7 +17,6 @@ package seqfile
import (
"io"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
@@ -26,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/proc/device"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index f3b63dfc2..a37e1fa06 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -17,7 +17,6 @@ package proc
import (
"fmt"
"io"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
@@ -27,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -64,7 +64,7 @@ var _ fs.InodeOperations = (*tcpMemInode)(nil)
func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir tcpMemDir) *fs.Inode {
tm := &tcpMemInode{
- SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
+ SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
s: s,
dir: dir,
}
@@ -77,6 +77,11 @@ func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, dir
return fs.NewInode(ctx, tm, msrc, sattr)
}
+// Truncate implements fs.InodeOperations.Truncate.
+func (tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error {
+ return nil
+}
+
// GetFile implements fs.InodeOperations.GetFile.
func (m *tcpMemInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
flags.Pread = true
@@ -168,14 +173,15 @@ func writeSize(dirType tcpMemDir, s inet.Stack, size inet.TCPBufferSize) error {
// +stateify savable
type tcpSack struct {
+ fsutil.SimpleFileInode
+
stack inet.Stack `state:"wait"`
enabled *bool
- fsutil.SimpleFileInode
}
func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
ts := &tcpSack{
- SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0444), linux.PROC_SUPER_MAGIC),
+ SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
stack: s,
}
sattr := fs.StableAttr{
@@ -187,6 +193,11 @@ func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *f
return fs.NewInode(ctx, ts, msrc, sattr)
}
+// Truncate implements fs.InodeOperations.Truncate.
+func (tcpSack) Truncate(context.Context, *fs.Inode, int64) error {
+ return nil
+}
+
// GetFile implements fs.InodeOperations.GetFile.
func (s *tcpSack) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
flags.Pread = true
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 87184ec67..9bf4b4527 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -67,29 +67,28 @@ type taskDir struct {
var _ fs.InodeOperations = (*taskDir)(nil)
// newTaskDir creates a new proc task entry.
-func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, showSubtasks bool) *fs.Inode {
+func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
contents := map[string]*fs.Inode{
- "auxv": newAuxvec(t, msrc),
- "cmdline": newExecArgInode(t, msrc, cmdlineExecArg),
- "comm": newComm(t, msrc),
- "environ": newExecArgInode(t, msrc, environExecArg),
- "exe": newExe(t, msrc),
- "fd": newFdDir(t, msrc),
- "fdinfo": newFdInfoDir(t, msrc),
- "gid_map": newGIDMap(t, msrc),
- // FIXME(b/123511468): create the correct io file for threads.
- "io": newIO(t, msrc),
+ "auxv": newAuxvec(t, msrc),
+ "cmdline": newExecArgInode(t, msrc, cmdlineExecArg),
+ "comm": newComm(t, msrc),
+ "environ": newExecArgInode(t, msrc, environExecArg),
+ "exe": newExe(t, msrc),
+ "fd": newFdDir(t, msrc),
+ "fdinfo": newFdInfoDir(t, msrc),
+ "gid_map": newGIDMap(t, msrc),
+ "io": newIO(t, msrc, isThreadGroup),
"maps": newMaps(t, msrc),
"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
"ns": newNamespaceDir(t, msrc),
"smaps": newSmaps(t, msrc),
- "stat": newTaskStat(t, msrc, showSubtasks, p.pidns),
+ "stat": newTaskStat(t, msrc, isThreadGroup, p.pidns),
"statm": newStatm(t, msrc),
"status": newStatus(t, msrc, p.pidns),
"uid_map": newUIDMap(t, msrc),
}
- if showSubtasks {
+ if isThreadGroup {
contents["task"] = p.newSubtasks(t, msrc)
}
if len(p.cgroupControllers) > 0 {
@@ -605,6 +604,10 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) (
fmt.Fprintf(&buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
fmt.Fprintf(&buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
fmt.Fprintf(&buf, "Seccomp:\t%d\n", s.t.SeccompMode())
+ // We unconditionally report a single NUMA node. See
+ // pkg/sentry/syscalls/linux/sys_mempolicy.go.
+ fmt.Fprintf(&buf, "Mems_allowed:\t1\n")
+ fmt.Fprintf(&buf, "Mems_allowed_list:\t0\n")
return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*statusData)(nil)}}, 0
}
@@ -619,8 +622,11 @@ type ioData struct {
ioUsage
}
-func newIO(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
- return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t)
+func newIO(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode {
+ if isThreadGroup {
+ return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t)
+ }
+ return newProcInode(t, seqfile.NewSeqFile(t, &ioData{t}), msrc, fs.SpecialFile, t)
}
// NeedsUpdate returns whether the generation is old or not.
@@ -639,7 +645,7 @@ func (i *ioData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
io.Accumulate(i.IOUsage())
var buf bytes.Buffer
- fmt.Fprintf(&buf, "char: %d\n", io.CharsRead)
+ fmt.Fprintf(&buf, "rchar: %d\n", io.CharsRead)
fmt.Fprintf(&buf, "wchar: %d\n", io.CharsWritten)
fmt.Fprintf(&buf, "syscr: %d\n", io.ReadSyscalls)
fmt.Fprintf(&buf, "syscw: %d\n", io.WriteSyscalls)
diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD
index 012cb3e44..3fb7b0633 100644
--- a/pkg/sentry/fs/ramfs/BUILD
+++ b/pkg/sentry/fs/ramfs/BUILD
@@ -21,6 +21,7 @@ go_library(
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go
index 78e082b8e..dcbb8eb2e 100644
--- a/pkg/sentry/fs/ramfs/dir.go
+++ b/pkg/sentry/fs/ramfs/dir.go
@@ -17,7 +17,6 @@ package ramfs
import (
"fmt"
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -25,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/fs/restore.go b/pkg/sentry/fs/restore.go
index f10168125..64c6a6ae9 100644
--- a/pkg/sentry/fs/restore.go
+++ b/pkg/sentry/fs/restore.go
@@ -15,7 +15,7 @@
package fs
import (
- "sync"
+ "gvisor.dev/gvisor/pkg/sync"
)
// RestoreEnvironment is the restore environment for file systems. It consists
diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go
index 311798811..389c330a0 100644
--- a/pkg/sentry/fs/splice.go
+++ b/pkg/sentry/fs/splice.go
@@ -167,6 +167,11 @@ func Splice(ctx context.Context, dst *File, src *File, opts SpliceOpts) (int64,
if !srcPipe && !opts.SrcOffset {
atomic.StoreInt64(&src.offset, src.offset+n)
}
+
+ // Don't report any errors if we have some progress without data loss.
+ if w.Err == nil {
+ err = nil
+ }
}
// Drop locks.
diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD
index 59ce400c2..3400b940c 100644
--- a/pkg/sentry/fs/tmpfs/BUILD
+++ b/pkg/sentry/fs/tmpfs/BUILD
@@ -31,6 +31,7 @@ go_library(
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usage",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index f86dfaa36..f1c87fe41 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -17,7 +17,6 @@ package tmpfs
import (
"fmt"
"io"
- "sync"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -31,6 +30,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 95ad98cb0..f6f60d0cf 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -30,6 +30,7 @@ go_library(
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/unimpl",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 2f639c823..88aa66b24 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -19,7 +19,6 @@ import (
"fmt"
"math"
"strconv"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
@@ -28,6 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go
index 7cc0eb409..894964260 100644
--- a/pkg/sentry/fs/tty/line_discipline.go
+++ b/pkg/sentry/fs/tty/line_discipline.go
@@ -16,13 +16,13 @@ package tty
import (
"bytes"
- "sync"
"unicode/utf8"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index bc56be696..6b07f6bf2 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -32,7 +32,6 @@ import (
// +stateify savable
type masterInodeOperations struct {
fsutil.SimpleFileInode
- fsutil.InodeNoopTruncate
// d is the containing dir.
d *dirInodeOperations
@@ -77,6 +76,11 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn
func (mi *masterInodeOperations) Release(ctx context.Context) {
}
+// Truncate implements fs.InodeOperations.Truncate.
+func (*masterInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+ return nil
+}
+
// GetFile implements fs.InodeOperations.GetFile.
//
// It allocates a new terminal.
diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go
index 231e4e6eb..8b5d4699a 100644
--- a/pkg/sentry/fs/tty/queue.go
+++ b/pkg/sentry/fs/tty/queue.go
@@ -15,13 +15,12 @@
package tty
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index 4cbea0367..2a51e6bab 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -31,7 +31,6 @@ import (
// +stateify savable
type slaveInodeOperations struct {
fsutil.SimpleFileInode
- fsutil.InodeNoopTruncate
// d is the containing dir.
d *dirInodeOperations
@@ -73,6 +72,11 @@ func (si *slaveInodeOperations) Release(ctx context.Context) {
si.t.DecRef()
}
+// Truncate implements fs.InodeOperations.Truncate.
+func (slaveInodeOperations) Truncate(context.Context, *fs.Inode, int64) error {
+ return nil
+}
+
// GetFile implements fs.InodeOperations.GetFile.
//
// This may race with destruction of the terminal. If the terminal is gone, it
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index ff8138820..917f90cc0 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -53,8 +53,8 @@ func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal
d: d,
n: n,
ld: newLineDiscipline(termios),
- masterKTTY: &kernel.TTY{},
- slaveKTTY: &kernel.TTY{},
+ masterKTTY: &kernel.TTY{Index: n},
+ slaveKTTY: &kernel.TTY{Index: n},
}
t.EnableLeakCheck("tty.Terminal")
return &t
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 7ccff8b0d..903874141 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -38,6 +38,7 @@ go_library(
"//pkg/abi/linux",
"//pkg/binary",
"//pkg/fd",
+ "//pkg/fspath",
"//pkg/log",
"//pkg/sentry/arch",
"//pkg/sentry/context",
@@ -49,6 +50,7 @@ go_library(
"//pkg/sentry/syscalls/linux",
"//pkg/sentry/usermem",
"//pkg/sentry/vfs",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
@@ -73,6 +75,7 @@ go_test(
deps = [
"//pkg/abi/linux",
"//pkg/binary",
+ "//pkg/fspath",
"//pkg/sentry/context",
"//pkg/sentry/context/contexttest",
"//pkg/sentry/fsimpl/ext/disklayout",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD
index bfc46dfa6..4fc8296ef 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/BUILD
+++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD
@@ -7,6 +7,7 @@ go_test(
size = "small",
srcs = ["benchmark_test.go"],
deps = [
+ "//pkg/fspath",
"//pkg/sentry/context",
"//pkg/sentry/context/contexttest",
"//pkg/sentry/fsimpl/ext",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 94cd74095..a56b03711 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -24,6 +24,7 @@ import (
"strings"
"testing"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext"
@@ -49,7 +50,9 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
// Create VFS.
vfsObj := vfs.New()
- vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{})
+ vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
if err != nil {
f.Close()
@@ -81,7 +84,11 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf
ctx := contexttest.Context(b)
creds := auth.CredentialsFromContext(ctx)
- if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}); err != nil {
+ if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
+ GetFilesystemOptions: vfs.GetFilesystemOptions{
+ InternalData: int(f.Fd()),
+ },
+ }); err != nil {
b.Fatalf("failed to mount tmpfs submount: %v", err)
}
return func() {
@@ -117,7 +124,7 @@ func BenchmarkVFS2Ext4fsStat(b *testing.B) {
stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
Root: *root,
Start: *root,
- Pathname: filePath,
+ Path: fspath.Parse(filePath),
FollowFinalSymlink: true,
}, &vfs.StatOptions{})
if err != nil {
@@ -146,9 +153,9 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
creds := auth.CredentialsFromContext(ctx)
mountPointName := "/1/"
pop := vfs.PathOperation{
- Root: *root,
- Start: *root,
- Pathname: mountPointName,
+ Root: *root,
+ Start: *root,
+ Path: fspath.Parse(mountPointName),
}
// Save the mount point for later use.
@@ -177,7 +184,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
Root: *root,
Start: *root,
- Pathname: filePath,
+ Path: fspath.Parse(filePath),
FollowFinalSymlink: true,
}, &vfs.StatOptions{})
if err != nil {
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 91802dc1e..8944171c8 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -15,8 +15,6 @@
package ext
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/log"
@@ -25,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go
index 567523d32..4110649ab 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go
@@ -29,8 +29,12 @@ package disklayout
// byte (i * sb.BlockSize()) to ((i+1) * sb.BlockSize()).
const (
- // ExtentStructsSize is the size of all the three extent on-disk structs.
- ExtentStructsSize = 12
+ // ExtentHeaderSize is the size of the header of an extent tree node.
+ ExtentHeaderSize = 12
+
+ // ExtentEntrySize is the size of an entry in an extent tree node.
+ // This size is the same for both leaf and internal nodes.
+ ExtentEntrySize = 12
// ExtentMagic is the magic number which must be present in the header.
ExtentMagic = 0xf30a
@@ -57,7 +61,7 @@ type ExtentNode struct {
Entries []ExtentEntryPair
}
-// ExtentEntry reprsents an extent tree node entry. The entry can either be
+// ExtentEntry represents an extent tree node entry. The entry can either be
// an ExtentIdx or Extent itself. This exists to simplify navigation logic.
type ExtentEntry interface {
// FileBlock returns the first file block number covered by this entry.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
index b0fad9b71..8762b90db 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
@@ -21,7 +21,7 @@ import (
// TestExtentSize tests that the extent structs are of the correct
// size.
func TestExtentSize(t *testing.T) {
- assertSize(t, ExtentHeader{}, ExtentStructsSize)
- assertSize(t, ExtentIdx{}, ExtentStructsSize)
- assertSize(t, Extent{}, ExtentStructsSize)
+ assertSize(t, ExtentHeader{}, ExtentHeaderSize)
+ assertSize(t, ExtentIdx{}, ExtentEntrySize)
+ assertSize(t, Extent{}, ExtentEntrySize)
}
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 307e4d68c..6c14a1e2d 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -25,6 +25,7 @@ import (
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
@@ -65,7 +66,9 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
// Create VFS.
vfsObj := vfs.New()
- vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{})
+ vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
if err != nil {
f.Close()
@@ -140,62 +143,61 @@ func TestSeek(t *testing.T) {
fd, err := vfsfs.OpenAt(
ctx,
auth.CredentialsFromContext(ctx),
- &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+ &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
&vfs.OpenOptions{},
)
if err != nil {
t.Fatalf("vfsfs.OpenAt failed: %v", err)
}
- if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
+ if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
t.Errorf("expected seek position 0, got %d and error %v", n, err)
}
- stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{})
+ stat, err := fd.Stat(ctx, vfs.StatOptions{})
if err != nil {
t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err)
}
// We should be able to seek beyond the end of file.
size := int64(stat.Size)
- if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
+ if n, err := fd.Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
t.Errorf("expected seek position %d, got %d and error %v", size, n, err)
}
// EINVAL should be returned if the resulting offset is negative.
- if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
+ if _, err := fd.Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
t.Errorf("expected error EINVAL but got %v", err)
}
- if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
+ if n, err := fd.Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err)
}
// Make sure negative offsets work with SEEK_CUR.
- if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
+ if n, err := fd.Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
}
// EINVAL should be returned if the resulting offset is negative.
- if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
+ if _, err := fd.Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
t.Errorf("expected error EINVAL but got %v", err)
}
// Make sure SEEK_END works with regular files.
- switch fd.Impl().(type) {
- case *regularFileFD:
+ if _, ok := fd.Impl().(*regularFileFD); ok {
// Seek back to 0.
- if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
+ if n, err := fd.Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
t.Errorf("expected seek position %d, got %d and error %v", 0, n, err)
}
// Seek forward beyond EOF.
- if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
+ if n, err := fd.Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
}
// EINVAL should be returned if the resulting offset is negative.
- if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
+ if _, err := fd.Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
t.Errorf("expected error EINVAL but got %v", err)
}
}
@@ -360,7 +362,7 @@ func TestStatAt(t *testing.T) {
got, err := vfsfs.StatAt(ctx,
auth.CredentialsFromContext(ctx),
- &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+ &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
&vfs.StatOptions{},
)
if err != nil {
@@ -430,7 +432,7 @@ func TestRead(t *testing.T) {
fd, err := vfsfs.OpenAt(
ctx,
auth.CredentialsFromContext(ctx),
- &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.absPath},
+ &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.absPath)},
&vfs.OpenOptions{},
)
if err != nil {
@@ -456,7 +458,7 @@ func TestRead(t *testing.T) {
want := make([]byte, 1)
for {
n, err := f.Read(want)
- fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
+ fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
if diff := cmp.Diff(got, want); diff != "" {
t.Errorf("file data mismatch (-want +got):\n%s", diff)
@@ -464,7 +466,7 @@ func TestRead(t *testing.T) {
// Make sure there is no more file data left after getting EOF.
if n == 0 || err == io.EOF {
- if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
+ if n, _ := fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image)
}
@@ -566,7 +568,7 @@ func TestIterDirents(t *testing.T) {
fd, err := vfsfs.OpenAt(
ctx,
auth.CredentialsFromContext(ctx),
- &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+ &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
&vfs.OpenOptions{},
)
if err != nil {
@@ -574,7 +576,7 @@ func TestIterDirents(t *testing.T) {
}
cb := &iterDirentsCb{}
- if err = fd.Impl().IterDirents(ctx, cb); err != nil {
+ if err = fd.IterDirents(ctx, cb); err != nil {
t.Fatalf("dir fd.IterDirents() failed: %v", err)
}
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index 3d3ebaca6..11dcc0346 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -57,7 +57,7 @@ func newExtentFile(regFile regularFile) (*extentFile, error) {
func (f *extentFile) buildExtTree() error {
rootNodeData := f.regFile.inode.diskInode.Data()
- binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header)
+ binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header)
// Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
if f.root.Header.NumEntries > 4 {
@@ -67,7 +67,7 @@ func (f *extentFile) buildExtTree() error {
}
f.root.Entries = make([]disklayout.ExtentEntryPair, f.root.Header.NumEntries)
- for i, off := uint16(0), disklayout.ExtentStructsSize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+ for i, off := uint16(0), disklayout.ExtentEntrySize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize {
var curEntry disklayout.ExtentEntry
if f.root.Header.Height == 0 {
// Leaf node.
@@ -76,7 +76,7 @@ func (f *extentFile) buildExtTree() error {
// Internal node.
curEntry = &disklayout.ExtentIdx{}
}
- binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry)
+ binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry)
f.root.Entries[i].Entry = curEntry
}
@@ -105,7 +105,7 @@ func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*diskla
}
entries := make([]disklayout.ExtentEntryPair, header.NumEntries)
- for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+ for i, off := uint16(0), off+disklayout.ExtentEntrySize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize {
var curEntry disklayout.ExtentEntry
if header.Height == 0 {
// Leaf node.
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 5eca2b83f..841274daf 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -26,13 +26,6 @@ import (
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
-
- // flags is the same as vfs.OpenOptions.Flags which are passed to
- // vfs.FilesystemImpl.OpenAt.
- // TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2),
- // fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set.
- // Only close(2), fstat(2), fstatfs(2) should work.
- flags uint32
}
func (fd *fileDescription) filesystem() *filesystem {
@@ -43,18 +36,6 @@ func (fd *fileDescription) inode() *inode {
return fd.vfsfd.Dentry().Impl().(*dentry).inode
}
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
- return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
- // None of the flags settable by fcntl(F_SETFL) are supported, so this is a
- // no-op.
- return nil
-}
-
// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
var stat linux.Statx
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 2d15e8aaf..9afb1a84c 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -17,12 +17,13 @@ package ext
import (
"errors"
"io"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -274,6 +275,16 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
return vfsd, nil
}
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+ vfsd, inode, err := fs.walk(rp, true)
+ if err != nil {
+ return nil, err
+ }
+ inode.incRef()
+ return vfsd, nil
+}
+
// OpenAt implements vfs.FilesystemImpl.OpenAt.
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
vfsd, inode, err := fs.walk(rp, false)
@@ -377,7 +388,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
}
// RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
if rp.Done() {
return syserror.ENOENT
}
@@ -441,3 +452,46 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return syserror.EROFS
}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+ _, _, err := fs.walk(rp, false)
+ if err != nil {
+ return nil, err
+ }
+ return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+ _, _, err := fs.walk(rp, false)
+ if err != nil {
+ return "", err
+ }
+ return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+ _, _, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+ return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+ _, _, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+ return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 24249525c..8608805bf 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -157,10 +157,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
switch in.impl.(type) {
case *regularFile:
var fd regularFileFD
- fd.flags = flags
- mnt.IncRef()
- vfsd.IncRef()
- fd.vfsfd.Init(&fd, mnt, vfsd)
+ fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
return &fd.vfsfd, nil
case *directory:
// Can't open directories writably. This check is not necessary for a read
@@ -169,10 +166,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
return nil, syserror.EISDIR
}
var fd directoryFD
- fd.flags = flags
- mnt.IncRef()
- vfsd.IncRef()
- fd.vfsfd.Init(&fd, mnt, vfsd)
+ fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
return &fd.vfsfd, nil
case *symlink:
if flags&linux.O_PATH == 0 {
@@ -180,10 +174,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
return nil, syserror.ELOOP
}
var fd symlinkFD
- fd.flags = flags
- mnt.IncRef()
- vfsd.IncRef()
- fd.vfsfd.Init(&fd, mnt, vfsd)
+ fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
return &fd.vfsfd, nil
default:
panic(fmt.Sprintf("unknown inode type: %T", in.impl))
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index aec33e00a..d11153c90 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -16,7 +16,6 @@ package ext
import (
"io"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
@@ -24,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
new file mode 100644
index 000000000..809178250
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -0,0 +1,64 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+ name = "slot_list",
+ out = "slot_list.go",
+ package = "kernfs",
+ prefix = "slot",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*slot",
+ "Linker": "*slot",
+ },
+)
+
+go_library(
+ name = "kernfs",
+ srcs = [
+ "dynamic_bytes_file.go",
+ "fd_impl_util.go",
+ "filesystem.go",
+ "inode_impl_util.go",
+ "kernfs.go",
+ "slot_list.go",
+ "symlink.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/fspath",
+ "//pkg/log",
+ "//pkg/refs",
+ "//pkg/sentry/context",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/usermem",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserror",
+ ],
+)
+
+go_test(
+ name = "kernfs_test",
+ size = "small",
+ srcs = ["kernfs_test.go"],
+ deps = [
+ ":kernfs",
+ "//pkg/abi/linux",
+ "//pkg/fspath",
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/usermem",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserror",
+ "@com_github_google_go-cmp//cmp:go_default_library",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
new file mode 100644
index 000000000..606ca692d
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -0,0 +1,127 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// DynamicBytesFile implements kernfs.Inode and represents a read-only
+// file whose contents are backed by a vfs.DynamicBytesSource.
+//
+// Must be instantiated with NewDynamicBytesFile or initialized with Init
+// before first use.
+//
+// +stateify savable
+type DynamicBytesFile struct {
+ InodeAttrs
+ InodeNoopRefCount
+ InodeNotDirectory
+ InodeNotSymlink
+
+ data vfs.DynamicBytesSource
+}
+
+var _ Inode = (*DynamicBytesFile)(nil)
+
+// Init initializes a dynamic bytes file.
+func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
+ if perm&^linux.PermissionsMask != 0 {
+ panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
+ }
+ f.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm)
+ f.data = data
+}
+
+// Open implements Inode.Open.
+func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+ fd := &DynamicBytesFD{}
+ fd.Init(rp.Mount(), vfsd, f.data, flags)
+ return &fd.vfsfd, nil
+}
+
+// SetStat implements Inode.SetStat.
+func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error {
+ // DynamicBytesFiles are immutable.
+ return syserror.EPERM
+}
+
+// DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a
+// DynamicBytesFile.
+//
+// Must be initialized with Init before first use.
+//
+// +stateify savable
+type DynamicBytesFD struct {
+ vfs.FileDescriptionDefaultImpl
+ vfs.DynamicBytesFileDescriptionImpl
+
+ vfsfd vfs.FileDescription
+ inode Inode
+}
+
+// Init initializes a DynamicBytesFD.
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) {
+ fd.inode = d.Impl().(*Dentry).inode
+ fd.SetDataSource(data)
+ fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return fd.FileDescriptionDefaultImpl.Write(ctx, src, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return fd.FileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *DynamicBytesFD) Release() {}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+ return fd.inode.Stat(fs), nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error {
+ // DynamicBytesFiles are immutable.
+ return syserror.EPERM
+}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
new file mode 100644
index 000000000..bcf069b5f
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -0,0 +1,194 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
+// inode that uses OrderedChildren to track child nodes. GenericDirectoryFD is
+// not compatible with dynamic directories.
+//
+// Note that GenericDirectoryFD holds a lock over OrderedChildren while calling
+// the IterDirents callback. The IterDirents callback therefore cannot hash or
+// unhash children, or recursively call IterDirents on the same underlying
+// inode.
+//
+// Must be initialized with Init before first use.
+type GenericDirectoryFD struct {
+ vfs.FileDescriptionDefaultImpl
+ vfs.DirectoryFileDescriptionDefaultImpl
+
+ vfsfd vfs.FileDescription
+ children *OrderedChildren
+ off int64
+}
+
+// Init initializes a GenericDirectoryFD.
+func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) {
+ fd.children = children
+ fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
+}
+
+// VFSFileDescription returns a pointer to the vfs.FileDescription representing
+// this object.
+func (fd *GenericDirectoryFD) VFSFileDescription() *vfs.FileDescription {
+ return &fd.vfsfd
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *GenericDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ return fd.FileDescriptionDefaultImpl.ConfigureMMap(ctx, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *GenericDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return fd.DirectoryFileDescriptionDefaultImpl.Read(ctx, dst, opts)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *GenericDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return fd.DirectoryFileDescriptionDefaultImpl.PRead(ctx, dst, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *GenericDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return fd.DirectoryFileDescriptionDefaultImpl.Write(ctx, src, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *GenericDirectoryFD) Release() {}
+
+func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem {
+ return fd.vfsfd.VirtualDentry().Mount().Filesystem()
+}
+
+func (fd *GenericDirectoryFD) inode() Inode {
+ return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents
+// holds the OrderedChildren mutex while calling cb.
+func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ vfsFS := fd.filesystem()
+ fs := vfsFS.Impl().(*Filesystem)
+ vfsd := fd.vfsfd.VirtualDentry().Dentry()
+
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+
+ // Handle ".".
+ if fd.off == 0 {
+ stat := fd.inode().Stat(vfsFS)
+ dirent := vfs.Dirent{
+ Name: ".",
+ Type: linux.DT_DIR,
+ Ino: stat.Ino,
+ NextOff: 1,
+ }
+ if !cb.Handle(dirent) {
+ return nil
+ }
+ fd.off++
+ }
+
+ // Handle "..".
+ if fd.off == 1 {
+ parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode
+ stat := parentInode.Stat(vfsFS)
+ dirent := vfs.Dirent{
+ Name: "..",
+ Type: linux.FileMode(stat.Mode).DirentType(),
+ Ino: stat.Ino,
+ NextOff: 2,
+ }
+ if !cb.Handle(dirent) {
+ return nil
+ }
+ fd.off++
+ }
+
+ // Handle static children.
+ fd.children.mu.RLock()
+ defer fd.children.mu.RUnlock()
+ // fd.off accounts for "." and "..", but fd.children does not track
+ // these.
+ childIdx := fd.off - 2
+ for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
+ inode := it.Dentry.Impl().(*Dentry).inode
+ stat := inode.Stat(vfsFS)
+ dirent := vfs.Dirent{
+ Name: it.Name,
+ Type: linux.FileMode(stat.Mode).DirentType(),
+ Ino: stat.Ino,
+ NextOff: fd.off + 1,
+ }
+ if !cb.Handle(dirent) {
+ return nil
+ }
+ fd.off++
+ }
+
+ var err error
+ relOffset := fd.off - int64(len(fd.children.set)) - 2
+ fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset)
+ return err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fs := fd.filesystem().Impl().(*Filesystem)
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+
+ switch whence {
+ case linux.SEEK_SET:
+ // Use offset as given.
+ case linux.SEEK_CUR:
+ offset += fd.off
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ fd.off = offset
+ return offset, nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ fs := fd.filesystem()
+ inode := fd.inode()
+ return inode.Stat(fs), nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ fs := fd.filesystem()
+ inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+ return inode.SetStat(fs, opts)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
new file mode 100644
index 000000000..79759e0fc
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -0,0 +1,793 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file implements vfs.FilesystemImpl for kernfs.
+
+package kernfs
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// stepExistingLocked resolves rp.Component() in parent directory vfsd.
+//
+// stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+//
+// Postcondition: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) {
+ d := vfsd.Impl().(*Dentry)
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ // Directory searchable?
+ if err := d.inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+afterSymlink:
+ name := rp.Component()
+ // Revalidation must be skipped if name is "." or ".."; d or its parent
+ // respectively can't be expected to transition from invalidated back to
+ // valid, so detecting invalidation and retrying would loop forever. This
+ // is consistent with Linux: fs/namei.c:walk_component() => lookup_fast()
+ // calls d_revalidate(), but walk_component() => handle_dots() does not.
+ if name == "." {
+ rp.Advance()
+ return vfsd, nil
+ }
+ if name == ".." {
+ nextVFSD, err := rp.ResolveParent(vfsd)
+ if err != nil {
+ return nil, err
+ }
+ rp.Advance()
+ return nextVFSD, nil
+ }
+ d.dirMu.Lock()
+ nextVFSD, err := rp.ResolveChild(vfsd, name)
+ if err != nil {
+ d.dirMu.Unlock()
+ return nil, err
+ }
+ next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, nextVFSD)
+ d.dirMu.Unlock()
+ if err != nil {
+ return nil, err
+ }
+ // Resolve any symlink at current path component.
+ if rp.ShouldFollowSymlink() && next.isSymlink() {
+ // TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks".
+ target, err := next.inode.Readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ goto afterSymlink
+ }
+ rp.Advance()
+ return &next.vfsd, nil
+}
+
+// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
+// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
+// nil) to verify that the returned child (or lack thereof) is correct.
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+// parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, childVFSD *vfs.Dentry) (*Dentry, error) {
+ if childVFSD != nil {
+ // Cached dentry exists, revalidate.
+ child := childVFSD.Impl().(*Dentry)
+ if !child.inode.Valid(ctx) {
+ vfsObj.ForceDeleteDentry(childVFSD)
+ fs.deferDecRef(childVFSD) // Reference from Lookup.
+ childVFSD = nil
+ }
+ }
+ if childVFSD == nil {
+ // Dentry isn't cached; it either doesn't exist or failed
+ // revalidation. Attempt to resolve it via Lookup.
+ //
+ // FIXME(b/144498111): Inode.Lookup() should return *(kernfs.)Dentry,
+ // not *vfs.Dentry, since (kernfs.)Filesystem assumes that all dentries
+ // in the filesystem are (kernfs.)Dentry and performs vfs.DentryImpl
+ // casts accordingly.
+ var err error
+ childVFSD, err = parent.inode.Lookup(ctx, name)
+ if err != nil {
+ return nil, err
+ }
+ // Reference on childVFSD dropped by a corresponding Valid.
+ parent.insertChildLocked(name, childVFSD)
+ }
+ return childVFSD.Impl().(*Dentry), nil
+}
+
+// walkExistingLocked resolves rp to an existing file.
+//
+// walkExistingLocked is loosely analogous to Linux's
+// fs/namei.c:path_lookupat().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
+ vfsd := rp.Start()
+ for !rp.Done() {
+ var err error
+ vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ d := vfsd.Impl().(*Dentry)
+ if rp.MustBeDir() && !d.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ return vfsd, d.inode, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory. It does not check that the returned directory is
+// searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
+ vfsd := rp.Start()
+ for !rp.Final() {
+ var err error
+ vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ d := vfsd.Impl().(*Dentry)
+ if !d.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ return vfsd, d.inode, nil
+}
+
+// checkCreateLocked checks that a file named rp.Component() may be created in
+// directory parentVFSD, then returns rp.Component().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. parentInode
+// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true.
+func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
+ if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ return "", err
+ }
+ pc := rp.Component()
+ if pc == "." || pc == ".." {
+ return "", syserror.EEXIST
+ }
+ childVFSD, err := rp.ResolveChild(parentVFSD, pc)
+ if err != nil {
+ return "", err
+ }
+ if childVFSD != nil {
+ return "", syserror.EEXIST
+ }
+ if parentVFSD.IsDisowned() {
+ return "", syserror.ENOENT
+ }
+ return pc, nil
+}
+
+// checkDeleteLocked checks that the file represented by vfsd may be deleted.
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
+ parentVFSD := vfsd.Parent()
+ if parentVFSD == nil {
+ return syserror.EBUSY
+ }
+ if parentVFSD.IsDisowned() {
+ return syserror.ENOENT
+ }
+ if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ return nil
+}
+
+// checkRenameLocked checks that a rename operation may be performed on the
+// target dentry across the given set of parent directories. The target dentry
+// may be nil.
+//
+// Precondition: isDir(dstInode) == true.
+func checkRenameLocked(creds *auth.Credentials, src, dstDir *vfs.Dentry, dstInode Inode) error {
+ srcDir := src.Parent()
+ if srcDir == nil {
+ return syserror.EBUSY
+ }
+ if srcDir.IsDisowned() {
+ return syserror.ENOENT
+ }
+ if dstDir.IsDisowned() {
+ return syserror.ENOENT
+ }
+ // Check for creation permissions on dst dir.
+ if err := dstInode.CheckPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *Filesystem) Release() {
+}
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *Filesystem) Sync(ctx context.Context) error {
+ // All filesystem state is in-memory.
+ return nil
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+ fs.mu.RLock()
+ defer fs.processDeferredDecRefs()
+ defer fs.mu.RUnlock()
+ vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+ if err != nil {
+ return nil, err
+ }
+
+ if opts.CheckSearchable {
+ d := vfsd.Impl().(*Dentry)
+ if !d.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ }
+ vfsd.IncRef() // Ownership transferred to caller.
+ return vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+ fs.mu.RLock()
+ defer fs.processDeferredDecRefs()
+ defer fs.mu.RUnlock()
+ vfsd, _, err := fs.walkParentDirLocked(ctx, rp)
+ if err != nil {
+ return nil, err
+ }
+ vfsd.IncRef() // Ownership transferred to caller.
+ return vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ fs.processDeferredDecRefsLocked()
+ if err != nil {
+ return err
+ }
+ pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+ if err != nil {
+ return err
+ }
+ if rp.Mount() != vd.Mount() {
+ return syserror.EXDEV
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+
+ d := vd.Dentry().Impl().(*Dentry)
+ if d.isDir() {
+ return syserror.EPERM
+ }
+
+ child, err := parentInode.NewLink(ctx, pc, d.inode)
+ if err != nil {
+ return err
+ }
+ parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+ return nil
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ fs.processDeferredDecRefsLocked()
+ if err != nil {
+ return err
+ }
+ pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+ if err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ child, err := parentInode.NewDir(ctx, pc, opts)
+ if err != nil {
+ return err
+ }
+ parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+ return nil
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ fs.processDeferredDecRefsLocked()
+ if err != nil {
+ return err
+ }
+ pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+ if err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ new, err := parentInode.NewNode(ctx, pc, opts)
+ if err != nil {
+ return err
+ }
+ parentVFSD.Impl().(*Dentry).InsertChild(pc, new)
+ return nil
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ // Filter out flags that are not supported by kernfs. O_DIRECTORY and
+ // O_NOFOLLOW have no effect here (they're handled by VFS by setting
+ // appropriate bits in rp), but are returned by
+ // FileDescriptionImpl.StatusFlags().
+ opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
+ ats := vfs.AccessTypesForOpenFlags(opts.Flags)
+
+ // Do not create new file.
+ if opts.Flags&linux.O_CREAT == 0 {
+ fs.mu.RLock()
+ defer fs.processDeferredDecRefs()
+ defer fs.mu.RUnlock()
+ vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+ if err != nil {
+ return nil, err
+ }
+ if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+ return inode.Open(rp, vfsd, opts.Flags)
+ }
+
+ // May create new file.
+ mustCreate := opts.Flags&linux.O_EXCL != 0
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*Dentry).inode
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ if rp.Done() {
+ if rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+ return inode.Open(rp, vfsd, opts.Flags)
+ }
+afterTrailingSymlink:
+ parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ fs.processDeferredDecRefsLocked()
+ if err != nil {
+ return nil, err
+ }
+ // Check for search permission in the parent directory.
+ if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ // Reject attempts to open directories with O_CREAT.
+ if rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ pc := rp.Component()
+ if pc == "." || pc == ".." {
+ return nil, syserror.EISDIR
+ }
+ // Determine whether or not we need to create a file.
+ childVFSD, err := rp.ResolveChild(parentVFSD, pc)
+ if err != nil {
+ return nil, err
+ }
+ if childVFSD == nil {
+ // Already checked for searchability above; now check for writability.
+ if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+ return nil, err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return nil, err
+ }
+ defer rp.Mount().EndWrite()
+ // Create and open the child.
+ child, err := parentInode.NewFile(ctx, pc, opts)
+ if err != nil {
+ return nil, err
+ }
+ parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+ return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags)
+ }
+ // Open existing file or follow symlink.
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ childDentry := childVFSD.Impl().(*Dentry)
+ childInode := childDentry.inode
+ if rp.ShouldFollowSymlink() {
+ if childDentry.isSymlink() {
+ target, err := childInode.Readlink(ctx)
+ if err != nil {
+ return nil, err
+ }
+ if err := rp.HandleSymlink(target); err != nil {
+ return nil, err
+ }
+ // rp.Final() may no longer be true since we now need to resolve the
+ // symlink target.
+ goto afterTrailingSymlink
+ }
+ }
+ if err := childInode.CheckPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+ return childInode.Open(rp, childVFSD, opts.Flags)
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+ fs.mu.RLock()
+ d, inode, err := fs.walkExistingLocked(ctx, rp)
+ fs.mu.RUnlock()
+ fs.processDeferredDecRefs()
+ if err != nil {
+ return "", err
+ }
+ if !d.Impl().(*Dentry).isSymlink() {
+ return "", syserror.EINVAL
+ }
+ return inode.Readlink(ctx)
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+ // Only RENAME_NOREPLACE is supported.
+ if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
+ return syserror.EINVAL
+ }
+ noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
+
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+
+ // Resolve the destination directory first to verify that it's on this
+ // Mount.
+ dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+ fs.processDeferredDecRefsLocked()
+ if err != nil {
+ return err
+ }
+ mnt := rp.Mount()
+ if mnt != oldParentVD.Mount() {
+ return syserror.EXDEV
+ }
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+
+ srcDirVFSD := oldParentVD.Dentry()
+ srcDir := srcDirVFSD.Impl().(*Dentry)
+ srcDir.dirMu.Lock()
+ src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDirVFSD.Child(oldName))
+ srcDir.dirMu.Unlock()
+ fs.processDeferredDecRefsLocked()
+ if err != nil {
+ return err
+ }
+ srcVFSD := &src.vfsd
+
+ // Can we remove the src dentry?
+ if err := checkDeleteLocked(rp, srcVFSD); err != nil {
+ return err
+ }
+
+ // Can we create the dst dentry?
+ var dstVFSD *vfs.Dentry
+ pc, err := checkCreateLocked(rp, dstDirVFSD, dstDirInode)
+ switch err {
+ case nil:
+ // Ok, continue with rename as replacement.
+ case syserror.EEXIST:
+ if noReplace {
+ // Won't overwrite existing node since RENAME_NOREPLACE was requested.
+ return syserror.EEXIST
+ }
+ dstVFSD, err = rp.ResolveChild(dstDirVFSD, pc)
+ if err != nil {
+ panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
+ }
+ default:
+ return err
+ }
+
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ virtfs := rp.VirtualFilesystem()
+
+ srcDirDentry := srcDirVFSD.Impl().(*Dentry)
+ dstDirDentry := dstDirVFSD.Impl().(*Dentry)
+
+ // We can't deadlock here due to lock ordering because we're protected from
+ // concurrent renames by fs.mu held for writing.
+ srcDirDentry.dirMu.Lock()
+ defer srcDirDentry.dirMu.Unlock()
+ dstDirDentry.dirMu.Lock()
+ defer dstDirDentry.dirMu.Unlock()
+
+ if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
+ return err
+ }
+ srcDirInode := srcDirDentry.inode
+ replaced, err := srcDirInode.Rename(ctx, srcVFSD.Name(), pc, srcVFSD, dstDirVFSD)
+ if err != nil {
+ virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
+ return err
+ }
+ virtfs.CommitRenameReplaceDentry(srcVFSD, dstDirVFSD, pc, replaced)
+ return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+ fs.processDeferredDecRefsLocked()
+ if err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ if err := checkDeleteLocked(rp, vfsd); err != nil {
+ return err
+ }
+ if !vfsd.Impl().(*Dentry).isDir() {
+ return syserror.ENOTDIR
+ }
+ if inode.HasChildren() {
+ return syserror.ENOTEMPTY
+ }
+ virtfs := rp.VirtualFilesystem()
+ parentDentry := vfsd.Parent().Impl().(*Dentry)
+ parentDentry.dirMu.Lock()
+ defer parentDentry.dirMu.Unlock()
+ if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+ return err
+ }
+ if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
+ virtfs.AbortDeleteDentry(vfsd)
+ return err
+ }
+ virtfs.CommitDeleteDentry(vfsd)
+ return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+ fs.mu.RLock()
+ _, inode, err := fs.walkExistingLocked(ctx, rp)
+ fs.mu.RUnlock()
+ fs.processDeferredDecRefs()
+ if err != nil {
+ return err
+ }
+ if opts.Stat.Mask == 0 {
+ return nil
+ }
+ return inode.SetStat(fs.VFSFilesystem(), opts)
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+ fs.mu.RLock()
+ _, inode, err := fs.walkExistingLocked(ctx, rp)
+ fs.mu.RUnlock()
+ fs.processDeferredDecRefs()
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ return inode.Stat(fs.VFSFilesystem()), nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+ fs.mu.RLock()
+ _, _, err := fs.walkExistingLocked(ctx, rp)
+ fs.mu.RUnlock()
+ fs.processDeferredDecRefs()
+ if err != nil {
+ return linux.Statfs{}, err
+ }
+ // TODO: actually implement statfs
+ return linux.Statfs{}, syserror.ENOSYS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+ fs.processDeferredDecRefsLocked()
+ if err != nil {
+ return err
+ }
+ pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+ if err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ child, err := parentInode.NewSymlink(ctx, pc, target)
+ if err != nil {
+ return err
+ }
+ parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+ return nil
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ vfsd, _, err := fs.walkExistingLocked(ctx, rp)
+ fs.processDeferredDecRefsLocked()
+ if err != nil {
+ return err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer rp.Mount().EndWrite()
+ if err := checkDeleteLocked(rp, vfsd); err != nil {
+ return err
+ }
+ if vfsd.Impl().(*Dentry).isDir() {
+ return syserror.EISDIR
+ }
+ virtfs := rp.VirtualFilesystem()
+ parentDentry := vfsd.Parent().Impl().(*Dentry)
+ parentDentry.dirMu.Lock()
+ defer parentDentry.dirMu.Unlock()
+ if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+ return err
+ }
+ if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
+ virtfs.AbortDeleteDentry(vfsd)
+ return err
+ }
+ virtfs.CommitDeleteDentry(vfsd)
+ return nil
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+ fs.mu.RLock()
+ _, _, err := fs.walkExistingLocked(ctx, rp)
+ fs.mu.RUnlock()
+ fs.processDeferredDecRefs()
+ if err != nil {
+ return nil, err
+ }
+ // kernfs currently does not support extended attributes.
+ return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+ fs.mu.RLock()
+ _, _, err := fs.walkExistingLocked(ctx, rp)
+ fs.mu.RUnlock()
+ fs.processDeferredDecRefs()
+ if err != nil {
+ return "", err
+ }
+ // kernfs currently does not support extended attributes.
+ return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+ fs.mu.RLock()
+ _, _, err := fs.walkExistingLocked(ctx, rp)
+ fs.mu.RUnlock()
+ fs.processDeferredDecRefs()
+ if err != nil {
+ return err
+ }
+ // kernfs currently does not support extended attributes.
+ return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+ fs.mu.RLock()
+ _, _, err := fs.walkExistingLocked(ctx, rp)
+ fs.mu.RUnlock()
+ fs.processDeferredDecRefs()
+ if err != nil {
+ return err
+ }
+ // kernfs currently does not support extended attributes.
+ return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
new file mode 100644
index 000000000..1d469a0db
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -0,0 +1,512 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// InodeNoopRefCount partially implements the Inode interface, specifically the
+// inodeRefs sub interface. InodeNoopRefCount implements a simple reference
+// count for inodes, performing no extra actions when references are obtained or
+// released. This is suitable for simple file inodes that don't reference any
+// resources.
+type InodeNoopRefCount struct {
+}
+
+// IncRef implements Inode.IncRef.
+func (n *InodeNoopRefCount) IncRef() {
+}
+
+// DecRef implements Inode.DecRef.
+func (n *InodeNoopRefCount) DecRef() {
+}
+
+// TryIncRef implements Inode.TryIncRef.
+func (n *InodeNoopRefCount) TryIncRef() bool {
+ return true
+}
+
+// Destroy implements Inode.Destroy.
+func (n *InodeNoopRefCount) Destroy() {
+}
+
+// InodeDirectoryNoNewChildren partially implements the Inode interface.
+// InodeDirectoryNoNewChildren represents a directory inode which does not
+// support creation of new children.
+type InodeDirectoryNoNewChildren struct{}
+
+// NewFile implements Inode.NewFile.
+func (*InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+// NewDir implements Inode.NewDir.
+func (*InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+// NewLink implements Inode.NewLink.
+func (*InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (*InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+// NewNode implements Inode.NewNode.
+func (*InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+// InodeNotDirectory partially implements the Inode interface, specifically the
+// inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not
+// represent directories can embed this to provide no-op implementations for
+// directory-related functions.
+type InodeNotDirectory struct {
+}
+
+// HasChildren implements Inode.HasChildren.
+func (*InodeNotDirectory) HasChildren() bool {
+ return false
+}
+
+// NewFile implements Inode.NewFile.
+func (*InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+ panic("NewFile called on non-directory inode")
+}
+
+// NewDir implements Inode.NewDir.
+func (*InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+ panic("NewDir called on non-directory inode")
+}
+
+// NewLink implements Inode.NewLink.
+func (*InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+ panic("NewLink called on non-directory inode")
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (*InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+ panic("NewSymlink called on non-directory inode")
+}
+
+// NewNode implements Inode.NewNode.
+func (*InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+ panic("NewNode called on non-directory inode")
+}
+
+// Unlink implements Inode.Unlink.
+func (*InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
+ panic("Unlink called on non-directory inode")
+}
+
+// RmDir implements Inode.RmDir.
+func (*InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
+ panic("RmDir called on non-directory inode")
+}
+
+// Rename implements Inode.Rename.
+func (*InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
+ panic("Rename called on non-directory inode")
+}
+
+// Lookup implements Inode.Lookup.
+func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+ panic("Lookup called on non-directory inode")
+}
+
+// IterDirents implements Inode.IterDirents.
+func (*InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+ panic("IterDirents called on non-directory inode")
+}
+
+// Valid implements Inode.Valid.
+func (*InodeNotDirectory) Valid(context.Context) bool {
+ return true
+}
+
+// InodeNoDynamicLookup partially implements the Inode interface, specifically
+// the inodeDynamicLookup sub interface. Directory inodes that do not support
+// dynamic entries (i.e. entries that are not "hashed" into the
+// vfs.Dentry.children) can embed this to provide no-op implementations for
+// functions related to dynamic entries.
+type InodeNoDynamicLookup struct{}
+
+// Lookup implements Inode.Lookup.
+func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+ return nil, syserror.ENOENT
+}
+
+// IterDirents implements Inode.IterDirents.
+func (*InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+ return offset, nil
+}
+
+// Valid implements Inode.Valid.
+func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool {
+ return true
+}
+
+// InodeNotSymlink partially implements the Inode interface, specifically the
+// inodeSymlink sub interface. All inodes that are not symlinks may embed this
+// to return the appropriate errors from symlink-related functions.
+type InodeNotSymlink struct{}
+
+// Readlink implements Inode.Readlink.
+func (*InodeNotSymlink) Readlink(context.Context) (string, error) {
+ return "", syserror.EINVAL
+}
+
+// InodeAttrs partially implements the Inode interface, specifically the
+// inodeMetadata sub interface. InodeAttrs provides functionality related to
+// inode attributes.
+//
+// Must be initialized by Init prior to first use.
+type InodeAttrs struct {
+ ino uint64
+ mode uint32
+ uid uint32
+ gid uint32
+ nlink uint32
+}
+
+// Init initializes this InodeAttrs.
+func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMode) {
+ if mode.FileType() == 0 {
+ panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode))
+ }
+
+ nlink := uint32(1)
+ if mode.FileType() == linux.ModeDirectory {
+ nlink = 2
+ }
+ atomic.StoreUint64(&a.ino, ino)
+ atomic.StoreUint32(&a.mode, uint32(mode))
+ atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID))
+ atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID))
+ atomic.StoreUint32(&a.nlink, nlink)
+}
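+
+// For illustration, a filesystem constructing a new directory inode might
+// initialize its attributes roughly as follows (creds and fs stand in for the
+// caller's auth.Credentials and kernfs Filesystem, respectively):
+//
+//   a.Init(creds, fs.NextIno(), linux.ModeDirectory|0755)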
+
+// Mode implements Inode.Mode.
+func (a *InodeAttrs) Mode() linux.FileMode {
+ return linux.FileMode(atomic.LoadUint32(&a.mode))
+}
+
+// Stat partially implements Inode.Stat. Note that this function doesn't provide
+// all the stat fields, and the embedder should consider extending the result
+// with filesystem-specific fields.
+func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx {
+ var stat linux.Statx
+ stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
+ stat.Ino = atomic.LoadUint64(&a.ino)
+ stat.Mode = uint16(a.Mode())
+ stat.UID = atomic.LoadUint32(&a.uid)
+ stat.GID = atomic.LoadUint32(&a.gid)
+ stat.Nlink = atomic.LoadUint32(&a.nlink)
+
+ // TODO: Implement other stat fields like timestamps.
+
+ return stat
+}
+
+// SetStat implements Inode.SetStat.
+func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
+ stat := opts.Stat
+ if stat.Mask&linux.STATX_MODE != 0 {
+ for {
+ old := atomic.LoadUint32(&a.mode)
+ new := old | uint32(stat.Mode & ^uint16(linux.S_IFMT))
+ if swapped := atomic.CompareAndSwapUint32(&a.mode, old, new); swapped {
+ break
+ }
+ }
+ }
+
+ if stat.Mask&linux.STATX_UID != 0 {
+ atomic.StoreUint32(&a.uid, stat.UID)
+ }
+ if stat.Mask&linux.STATX_GID != 0 {
+ atomic.StoreUint32(&a.gid, stat.GID)
+ }
+
+ // Note that not all fields are modifiable. For example, the file type and
+ // inode numbers are immutable after node creation.
+
+ // TODO: Implement other stat fields like timestamps.
+
+ return nil
+}
+
+// CheckPermissions implements Inode.CheckPermissions.
+func (a *InodeAttrs) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+ mode := a.Mode()
+ return vfs.GenericCheckPermissions(
+ creds,
+ ats,
+ mode.FileType() == linux.ModeDirectory,
+ uint16(mode),
+ auth.KUID(atomic.LoadUint32(&a.uid)),
+ auth.KGID(atomic.LoadUint32(&a.gid)),
+ )
+}
+
+// IncLinks implements Inode.IncLinks.
+func (a *InodeAttrs) IncLinks(n uint32) {
+ if atomic.AddUint32(&a.nlink, n) <= n {
+ panic("InodeLink.IncLinks called with no existing links")
+ }
+}
+
+// DecLinks implements Inode.DecLinks.
+func (a *InodeAttrs) DecLinks() {
+ if nlink := atomic.AddUint32(&a.nlink, ^uint32(0)); nlink == ^uint32(0) {
+ // Negative overflow
+ panic("Inode.DecLinks called at 0 links")
+ }
+}
+
+type slot struct {
+ Name string
+ Dentry *vfs.Dentry
+ slotEntry
+}
+
+// OrderedChildrenOptions contains initialization options for OrderedChildren.
+type OrderedChildrenOptions struct {
+ // Writable indicates whether vfs.FilesystemImpl methods implemented by
+ // OrderedChildren may modify the tracked children. This applies to
+ // operations related to rename, unlink and rmdir. If an OrderedChildren is
+ // not writable, these operations all fail with EPERM.
+ Writable bool
+}
+
+// OrderedChildren partially implements the Inode interface. OrderedChildren can
+// be embedded in directory inodes to keep track of the children in the
+// directory, and can then be used to implement a generic directory FD -- see
+// GenericDirectoryFD. OrderedChildren is not compatible with dynamic
+// directories.
+//
+// Must be initialized with Init before first use.
+type OrderedChildren struct {
+ refs.AtomicRefCount
+
+ // Can children be modified by user syscalls? If set to false, interface
+ // methods that would modify the children return EPERM. Immutable.
+ writable bool
+
+ mu sync.RWMutex
+ order slotList
+ set map[string]*slot
+}
+
+// Init initializes an OrderedChildren.
+func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
+ o.writable = opts.Writable
+ o.set = make(map[string]*slot)
+}
+
+// DecRef implements Inode.DecRef.
+func (o *OrderedChildren) DecRef() {
+ o.AtomicRefCount.DecRefWithDestructor(o.Destroy)
+}
+
+// Destroy cleans up resources referenced by this OrderedChildren.
+func (o *OrderedChildren) Destroy() {
+ o.mu.Lock()
+ defer o.mu.Unlock()
+ o.order.Reset()
+ o.set = nil
+}
+
+// Populate inserts children into this OrderedChildren, and d's dentry
+// cache. Populate returns the number of directories inserted, which the caller
+// may use to update the link count for the parent directory.
+//
+// Precondition: d.Impl() must be a kernfs Dentry. d must represent a directory
+// inode. children must not contain any conflicting entries already in o.
+func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 {
+ var links uint32
+ for name, child := range children {
+ if child.isDir() {
+ links++
+ }
+ if err := o.Insert(name, child.VFSDentry()); err != nil {
+ panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
+ }
+ d.InsertChild(name, child.VFSDentry())
+ }
+ return links
+}
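+
+// Sketch of typical use when building a static directory tree (dirInode,
+// dirDentry, fooDentry and barDentry are hypothetical; dirInode is assumed to
+// embed both OrderedChildren and InodeAttrs):
+//
+//   links := dirInode.Populate(dirDentry, map[string]*Dentry{
+//       "foo": fooDentry,
+//       "bar": barDentry,
+//   })
+//   dirInode.IncLinks(links) // Account for any new subdirectories.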
+
+// HasChildren implements Inode.HasChildren.
+func (o *OrderedChildren) HasChildren() bool {
+ o.mu.RLock()
+ defer o.mu.RUnlock()
+ return len(o.set) > 0
+}
+
+// Insert inserts child into o. This ignores the writability of o, as this is
+// not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
+func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
+ o.mu.Lock()
+ defer o.mu.Unlock()
+ if _, ok := o.set[name]; ok {
+ return syserror.EEXIST
+ }
+ s := &slot{
+ Name: name,
+ Dentry: child,
+ }
+ o.order.PushBack(s)
+ o.set[name] = s
+ return nil
+}
+
+// Precondition: caller must hold o.mu for writing.
+func (o *OrderedChildren) removeLocked(name string) {
+ if s, ok := o.set[name]; ok {
+ delete(o.set, name)
+ o.order.Remove(s)
+ }
+}
+
+// Precondition: caller must hold o.mu for writing.
+func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry {
+ if s, ok := o.set[name]; ok {
+ // Existing slot with given name, simply replace the dentry.
+ var old *vfs.Dentry
+ old, s.Dentry = s.Dentry, new
+ return old
+ }
+
+ // No existing slot with given name, create and hash new slot.
+ s := &slot{
+ Name: name,
+ Dentry: new,
+ }
+ o.order.PushBack(s)
+ o.set[name] = s
+ return nil
+}
+
+// Precondition: caller must hold o.mu for reading or writing.
+func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error {
+ s, ok := o.set[name]
+ if !ok {
+ return syserror.ENOENT
+ }
+ if s.Dentry != child {
+ panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child))
+ }
+ return nil
+}
+
+// Unlink implements Inode.Unlink.
+func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+ if !o.writable {
+ return syserror.EPERM
+ }
+ o.mu.Lock()
+ defer o.mu.Unlock()
+ if err := o.checkExistingLocked(name, child); err != nil {
+ return err
+ }
+ o.removeLocked(name)
+ return nil
+}
+
+// RmDir implements Inode.RmDir.
+func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+ // We're not responsible for checking that child is a directory, that it's
+ // empty, or updating any link counts; so this is the same as unlink.
+ return o.Unlink(ctx, name, child)
+}
+
+type renameAcrossDifferentImplementationsError struct{}
+
+func (renameAcrossDifferentImplementationsError) Error() string {
+ return "rename across inodes with different implementations"
+}
+
+// Rename implements Inode.Rename.
+//
+// Precondition: Rename may only be called across two directory inodes with
+// identical implementations of Rename. Practically, this means filesystems that
+// implement Rename by embedding OrderedChildren for any directory
+// implementation must use OrderedChildren for all directory implementations
+// that will support Rename.
+//
+// Postcondition: reference on any replaced dentry transferred to caller.
+func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) {
+ dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren)
+ if !ok {
+ return nil, renameAcrossDifferentImplementationsError{}
+ }
+ if !o.writable || !dst.writable {
+ return nil, syserror.EPERM
+ }
+ // Note: There's a potential deadlock below if concurrent calls to Rename
+ // refer to the same src and dst directories in reverse. We avoid any
+ // ordering issues because the caller is required to serialize concurrent
+ // calls to Rename in accordance with the interface declaration.
+ o.mu.Lock()
+ defer o.mu.Unlock()
+ if dst != o {
+ dst.mu.Lock()
+ defer dst.mu.Unlock()
+ }
+ if err := o.checkExistingLocked(oldname, child); err != nil {
+ return nil, err
+ }
+ replaced := dst.replaceChildLocked(newname, child)
+ return replaced, nil
+}
+
+// nthLocked returns an iterator to the nth child tracked by this object. The
+// iterator is valid until the caller releases o.mu. Returns nil if the
+// requested index falls out of bounds.
+//
+// Precondition: Caller must hold o.mu for reading.
+func (o *OrderedChildren) nthLocked(i int64) *slot {
+ for it := o.order.Front(); it != nil && i >= 0; it = it.Next() {
+ if i == 0 {
+ return it
+ }
+ i--
+ }
+ return nil
+}
+
+// InodeSymlink partially implements Inode interface for symlinks.
+type InodeSymlink struct {
+ InodeNotDirectory
+}
+
+// Open implements Inode.Open.
+func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+ return nil, syserror.ELOOP
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
new file mode 100644
index 000000000..bb12f39a2
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -0,0 +1,422 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kernfs provides the tools to implement inode-based filesystems.
+// Kernfs has two main features:
+//
+// 1. The Inode interface, which maps VFS2's path-based filesystem operations to
+// specific filesystem nodes. Kernfs uses the Inode interface to provide a
+// blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as
+// the synchronization mechanism for all filesystem operations by holding a
+// filesystem-wide lock across all operations.
+//
+// 2. Various utility types which provide generic implementations for various
+// parts of the Inode and vfs.FileDescription interfaces. Client filesystems
+// based on kernfs can embed the appropriate set of these to avoid having to
+// reimplement common filesystem operations. See inode_impl_util.go and
+// fd_impl_util.go.
+//
+// Reference Model:
+//
+// Kernfs dentries represent named pointers to inodes. Dentries and inodes have
+// independent lifetimes and reference counts. A child dentry unconditionally
+// holds a reference on its parent directory's dentry. A dentry also holds a
+// reference on the inode it points to. Multiple dentries can point to the same
+// inode (for example, in the case of hardlinks). File descriptors hold a
+// reference to the dentry they're opened on.
+//
+// Dentries are guaranteed to exist while holding Filesystem.mu for
+// reading. Dropping dentries requires holding Filesystem.mu for writing. To
+// queue dentries for destruction from a read critical section, see
+// Filesystem.deferDecRef.
+//
+// Lock ordering:
+//
+// kernfs.Filesystem.mu
+// kernfs.Dentry.dirMu
+// vfs.VirtualFilesystem.mountMu
+// vfs.Dentry.mu
+// kernfs.Filesystem.droppedDentriesMu
+// (inode implementation locks, if any)
+package kernfs
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
+// filesystem. Concrete implementations are expected to embed this in their own
+// Filesystem type.
+type Filesystem struct {
+ vfsfs vfs.Filesystem
+
+ droppedDentriesMu sync.Mutex
+
+ // droppedDentries is a list of dentries waiting to be DecRef()ed. This is
+ // used to defer dentry destruction until mu can be acquired for
+ // writing. Protected by droppedDentriesMu.
+ droppedDentries []*vfs.Dentry
+
+ // mu synchronizes the lifetime of Dentries on this filesystem. Holding it
+ // for reading guarantees continued existence of any resolved dentries, but
+ // the dentry tree may be modified.
+ //
+ // Kernfs dentries can only be DecRef()ed while holding mu for writing. For
+ // example:
+ //
+ // fs.mu.Lock()
+ // defer fs.mu.Unlock()
+ // ...
+ // dentry1.DecRef()
+ // defer dentry2.DecRef() // Ok, will run before Unlock.
+ //
+ // If discarding dentries in a read context, use Filesystem.deferDecRef. For
+ // example:
+ //
+ // fs.mu.RLock()
+ // defer fs.processDeferredDecRefs()
+ // defer fs.mu.RUnlock()
+ // ...
+ // fs.deferDecRef(dentry)
+ mu sync.RWMutex
+
+ // nextInoMinusOne is used to allocate inode numbers on this
+ // filesystem. Must be accessed by atomic operations.
+ nextInoMinusOne uint64
+}
+
+// deferDecRef defers dropping a dentry ref until the next call to
+// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
+//
+// Precondition: d must not already be pending destruction.
+func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
+ fs.droppedDentriesMu.Lock()
+ fs.droppedDentries = append(fs.droppedDentries, d)
+ fs.droppedDentriesMu.Unlock()
+}
+
+// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
+// droppedDentries list. See comment on Filesystem.mu.
+func (fs *Filesystem) processDeferredDecRefs() {
+ fs.mu.Lock()
+ fs.processDeferredDecRefsLocked()
+ fs.mu.Unlock()
+}
+
+// Precondition: fs.mu must be held for writing.
+func (fs *Filesystem) processDeferredDecRefsLocked() {
+ fs.droppedDentriesMu.Lock()
+ for _, d := range fs.droppedDentries {
+ d.DecRef()
+ }
+ fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
+ fs.droppedDentriesMu.Unlock()
+}
+
+// Init initializes a kernfs filesystem. This should be called during
+// vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding
+// kernfs.
+func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem) {
+ fs.vfsfs.Init(vfsObj, fs)
+}
+
+// VFSFilesystem returns the generic vfs filesystem object.
+func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem {
+ return &fs.vfsfs
+}
+
+// NextIno allocates a new inode number on this filesystem.
+func (fs *Filesystem) NextIno() uint64 {
+ return atomic.AddUint64(&fs.nextInoMinusOne, 1)
+}
+
+// These consts are used in the Dentry.flags field.
+const (
+ // Dentry points to a directory inode.
+ dflagsIsDir = 1 << iota
+
+ // Dentry points to a symlink inode.
+ dflagsIsSymlink
+)
+
+// Dentry implements vfs.DentryImpl.
+//
+// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
+// named reference to an inode. A dentry generally lives as long as it's part of
+// a mounted filesystem tree. Kernfs doesn't cache dentries once all references
+// to them are removed. Dentries hold a single reference to the inode they point
+// to, and child dentries hold a reference on their parent.
+//
+// Must be initialized by Init prior to first use.
+type Dentry struct {
+ refs.AtomicRefCount
+
+ vfsd vfs.Dentry
+ inode Inode
+
+ refs uint64
+
+ // flags caches useful information about the dentry from the inode. See the
+ // dflags* consts above. Must be accessed by atomic ops.
+ flags uint32
+
+ // dirMu protects vfsd.children for directory dentries.
+ dirMu sync.Mutex
+}
+
+// Init initializes this dentry.
+//
+// Precondition: Caller must hold a reference on inode.
+//
+// Postcondition: Caller's reference on inode is transferred to the dentry.
+func (d *Dentry) Init(inode Inode) {
+ d.vfsd.Init(d)
+ d.inode = inode
+ ftype := inode.Mode().FileType()
+ if ftype == linux.ModeDirectory {
+ d.flags |= dflagsIsDir
+ }
+ if ftype == linux.ModeSymlink {
+ d.flags |= dflagsIsSymlink
+ }
+}
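+
+// Sketch of typical construction (newInode is a hypothetical constructor that
+// returns an Inode with a reference held by the caller):
+//
+//   inode := newInode(...)
+//   d := &Dentry{}
+//   d.Init(inode) // The caller's reference on inode now belongs to d.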
+
+// VFSDentry returns the generic vfs dentry for this kernfs dentry.
+func (d *Dentry) VFSDentry() *vfs.Dentry {
+ return &d.vfsd
+}
+
+// isDir checks whether the dentry points to a directory inode.
+func (d *Dentry) isDir() bool {
+ return atomic.LoadUint32(&d.flags)&dflagsIsDir != 0
+}
+
+// isSymlink checks whether the dentry points to a symlink inode.
+func (d *Dentry) isSymlink() bool {
+ return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *Dentry) DecRef() {
+ d.AtomicRefCount.DecRefWithDestructor(d.destroy)
+}
+
+// Precondition: Dentry must be removed from VFS' dentry cache.
+func (d *Dentry) destroy() {
+ d.inode.DecRef() // IncRef from Init.
+ d.inode = nil
+ if parent := d.vfsd.Parent(); parent != nil {
+ parent.DecRef() // IncRef from Dentry.InsertChild.
+ }
+}
+
+// InsertChild inserts child into the vfs dentry cache with the given name under
+// this dentry. This does not update the directory inode, so calling this on
+// its own isn't sufficient to insert a child into a directory. InsertChild
+// updates the link count on d if required.
+//
+// Precondition: d must represent a directory inode.
+func (d *Dentry) InsertChild(name string, child *vfs.Dentry) {
+ d.dirMu.Lock()
+ d.insertChildLocked(name, child)
+ d.dirMu.Unlock()
+}
+
+// insertChildLocked is equivalent to InsertChild, with additional
+// preconditions.
+//
+// Precondition: d.dirMu must be locked.
+func (d *Dentry) insertChildLocked(name string, child *vfs.Dentry) {
+ if !d.isDir() {
+ panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
+ }
+ vfsDentry := d.VFSDentry()
+ vfsDentry.IncRef() // DecRef in child's Dentry.destroy.
+ vfsDentry.InsertChild(child, name)
+}
+
+// The Inode interface maps filesystem-level operations that operate on paths to
+// equivalent operations on specific filesystem nodes.
+//
+// The interface methods are grouped into logical categories as sub interfaces
+// below. Generally, an implementation for each sub interface can be provided by
+// embedding an appropriate type from inode_impl_utils.go. The sub interfaces
+// are purely organizational. Methods declared directly in the main interface
+// have no generic implementations, and should be explicitly provided by the
+// client filesystem.
+//
+// Generally, implementations are not responsible for tasks that are common to
+// all filesystems. These include:
+//
+// - Checking that dentries passed to methods are of the appropriate file type.
+// - Checking permissions.
+// - Updating link and reference counts.
+//
+// Specific responsibilities of implementations are documented below.
+type Inode interface {
+ // Methods related to reference counting. A generic implementation is
+ // provided by InodeNoopRefCount. These methods are generally called by the
+ // equivalent Dentry methods.
+ inodeRefs
+
+ // Methods related to node metadata. A generic implementation is provided by
+ // InodeAttrs.
+ inodeMetadata
+
+ // Method for inodes that represent symlinks. InodeNotSymlink provides a
+ // blanket implementation for all non-symlink inodes.
+ inodeSymlink
+
+ // Methods for inodes that represent directories. InodeNotDirectory provides
+ // a blanket implementation for all non-directory inodes.
+ inodeDirectory
+
+ // Methods for inodes that represent dynamic directories and their
+ // children. InodeNoDynamicLookup provides a blanket implementation for all
+ // non-dynamic-directory inodes.
+ inodeDynamicLookup
+
+ // Open creates a file description for the filesystem object represented by
+ // this inode. The returned file description should hold a reference on the
+ // inode for its lifetime.
+ //
+ // Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry.
+ Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error)
+}
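+
+// As a sketch of how the embeddable helpers in inode_impl_util.go are meant to
+// compose (the staticDir type here is hypothetical), a read-only directory
+// inode with a fixed set of children could be assembled roughly as:
+//
+//   type staticDir struct {
+//       InodeNotSymlink
+//       InodeDirectoryNoNewChildren
+//       InodeNoDynamicLookup
+//       InodeAttrs
+//       OrderedChildren // Also supplies the inode reference count.
+//   }
+//
+// leaving only Open to be implemented, typically by returning a
+// GenericDirectoryFD backed by the embedded OrderedChildren.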
+
+type inodeRefs interface {
+ IncRef()
+ DecRef()
+ TryIncRef() bool
+ // Destroy is called when the inode reaches zero references. Destroy releases
+ // all resources (references) on objects referenced by the inode, including
+ // any child dentries.
+ Destroy()
+}
+
+type inodeMetadata interface {
+ // CheckPermissions checks that creds may access this inode for the
+ // requested access type, per the rules of
+ // fs/namei.c:generic_permission().
+ CheckPermissions(creds *auth.Credentials, atx vfs.AccessTypes) error
+
+ // Mode returns the (struct stat)::st_mode value for this inode. This is
+ // separated from Stat for performance.
+ Mode() linux.FileMode
+
+ // Stat returns the metadata for this inode. This corresponds to
+ // vfs.FilesystemImpl.StatAt.
+ Stat(fs *vfs.Filesystem) linux.Statx
+
+ // SetStat updates the metadata for this inode. This corresponds to
+ // vfs.FilesystemImpl.SetStatAt.
+ SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error
+}
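
As a concrete (if trivial) example of the metadata sub-interface, an inode can embed InodeAttrs and shadow a single method; the attrs type in kernfs_test.go later in this change does exactly this to make its metadata immutable:

```go
// attrs gets Mode, Stat and CheckPermissions from InodeAttrs and overrides
// only SetStat, rejecting all metadata updates.
type attrs struct {
	kernfs.InodeAttrs
}

func (a *attrs) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error {
	return syserror.EPERM
}
```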
+
+// Precondition: All methods in this interface may only be called on directory
+// inodes.
+type inodeDirectory interface {
+ // The New{File,Dir,Node,Symlink} methods below should return a new inode
+ // hashed into this inode.
+ //
+ // These inode constructors are inode-level operations rather than
+ // filesystem-level operations to allow client filesystems to mix different
+ // implementations based on the new node's location in the
+ // filesystem.
+
+ // HasChildren returns true if the directory inode has any children.
+ HasChildren() bool
+
+ // NewFile creates a new regular file inode.
+ NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error)
+
+ // NewDir creates a new directory inode.
+ NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error)
+
+ // NewLink creates a new hardlink to a specified inode in this
+ // directory. Implementations should create a new kernfs Dentry pointing to
+ // target, and update target's link count.
+ NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error)
+
+ // NewSymlink creates a new symbolic link inode.
+ NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error)
+
+ // NewNode creates a new filesystem node for a mknod syscall.
+ NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error)
+
+ // Unlink removes a child dentry from this directory inode.
+ Unlink(ctx context.Context, name string, child *vfs.Dentry) error
+
+ // RmDir removes an empty child directory from this directory
+ // inode. Implementations must update the parent directory's link count,
+ // if required. Implementations are not responsible for checking that child
+ // is a directory, or for checking that the directory is empty.
+ RmDir(ctx context.Context, name string, child *vfs.Dentry) error
+
+ // Rename is called on the source directory containing an inode being
+ // renamed. child should point to the resolved child in the source
+ // directory. If Rename replaces a dentry in the destination directory, it
+ // should return the replaced dentry or nil otherwise.
+ //
+ // Precondition: Caller must serialize concurrent calls to Rename.
+ Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error)
+}
+
+type inodeDynamicLookup interface {
+ // Lookup should return an appropriate dentry if name should resolve to a
+ // child of this dynamic directory inode. This gives the directory an
+ // opportunity on every lookup to resolve additional entries that aren't
+ // hashed into the directory. This is only called when the inode is a
+ // directory. If the inode is not a directory, or if the directory only
+ // contains a static set of children, the implementer can unconditionally
+ // return an appropriate error (ENOTDIR and ENOENT respectively).
+ //
+ // The child returned by Lookup will be hashed into the VFS dentry tree. Its
+ // lifetime can be controlled by the filesystem implementation with an
+ // appropriate implementation of Valid.
+ //
+ // Lookup returns the child with an extra reference and the caller owns this
+ // reference.
+ Lookup(ctx context.Context, name string) (*vfs.Dentry, error)
+
+ // Valid should return true if this inode is still valid, and false if it
+ // needs to be resolved again by a call to Lookup.
+ Valid(ctx context.Context) bool
+
+ // IterDirents is used to iterate over dynamically created entries. It invokes
+ // cb on each entry in the directory represented by the FileDescription.
+ // 'offset' is the offset for the entire IterDirents call, which may include
+ // results from the caller. 'relOffset' is the offset inside the entries
+ // returned by this IterDirents invocation. In other words,
+ // 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff,
+ // while 'relOffset' is the place where iteration should start from.
+ IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
+}
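
The test filesystem below uses only static directories (via InodeNoDynamicLookup), so the dynamic side of this contract never appears there. The following is a hedged sketch of a Lookup/Valid pair for a dynamic directory; the type and the lookupDynamicChild helper are hypothetical, and Open, IterDirents and the remaining Inode methods are omitted.

```go
// dynamicDir is a hypothetical directory whose children are materialized on
// demand rather than pre-populated.
type dynamicDir struct {
	// Embedded kernfs helpers and fields omitted for brevity.
}

// Lookup materializes a child dentry for name. The returned dentry carries an
// extra reference owned by the caller and will be hashed into the VFS dentry
// tree.
func (d *dynamicDir) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
	inode, err := lookupDynamicChild(ctx, name) // hypothetical helper
	if err != nil {
		return nil, syserror.ENOENT
	}
	child := &kernfs.Dentry{}
	child.Init(inode)
	return child.VFSDentry(), nil
}

// Valid reports whether this inode may still be used; once it returns false,
// the entry is re-resolved through Lookup (proc's per-task directories, for
// example, become invalid when the task exits).
func (d *dynamicDir) Valid(ctx context.Context) bool {
	return true
}
```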
+
+type inodeSymlink interface {
+ // Readlink resolves the target of a symbolic link. If an inode is not a
+ // symlink, the implementation should return EINVAL.
+ Readlink(ctx context.Context) (string, error)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
new file mode 100644
index 000000000..5c9d580e1
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -0,0 +1,426 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs_test
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "runtime"
+ "testing"
+
+ "github.com/google/go-cmp/cmp"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const defaultMode linux.FileMode = 01777
+const staticFileContent = "This is sample content for a static test file."
+
+// RootDentryFn is a generator function for creating the root dentry of a test
+// filesystem. See newTestSystem.
+type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
+
+// TestSystem represents the context for a single test.
+type TestSystem struct {
+ t *testing.T
+ ctx context.Context
+ creds *auth.Credentials
+ vfs *vfs.VirtualFilesystem
+ mns *vfs.MountNamespace
+ root vfs.VirtualDentry
+}
+
+// newTestSystem sets up a minimal environment for running a test, including an
+// instance of a test filesystem. Tests can control the contents of the
+// filesystem by providing an appropriate rootFn, which should return a
+// pre-populated root dentry.
+func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem {
+ ctx := contexttest.Context(t)
+ creds := auth.CredentialsFromContext(ctx)
+ v := vfs.New()
+ v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+ mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
+ if err != nil {
+ t.Fatalf("Failed to create testfs root mount: %v", err)
+ }
+
+ s := &TestSystem{
+ t: t,
+ ctx: ctx,
+ creds: creds,
+ vfs: v,
+ mns: mns,
+ root: mns.Root(),
+ }
+ runtime.SetFinalizer(s, func(s *TestSystem) { s.root.DecRef() })
+ return s
+}
+
+// PathOpAtRoot constructs a vfs.PathOperation for a path from the
+// root of the test filesystem.
+//
+// Precondition: path should be a relative path.
+func (s *TestSystem) PathOpAtRoot(path string) vfs.PathOperation {
+ return vfs.PathOperation{
+ Root: s.root,
+ Start: s.root,
+ Path: fspath.Parse(path),
+ }
+}
+
+// GetDentryOrDie attempts to resolve a dentry referred to by the
+// provided path operation. If unsuccessful, the test fails.
+func (s *TestSystem) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry {
+ vd, err := s.vfs.GetDentryAt(s.ctx, s.creds, &pop, &vfs.GetDentryOptions{})
+ if err != nil {
+ s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err)
+ }
+ return vd
+}
+
+func (s *TestSystem) ReadToEnd(fd *vfs.FileDescription) (string, error) {
+ buf := make([]byte, usermem.PageSize)
+ bufIOSeq := usermem.BytesIOSequence(buf)
+ opts := vfs.ReadOptions{}
+
+ var content bytes.Buffer
+ for {
+ n, err := fd.Impl().Read(s.ctx, bufIOSeq, opts)
+ if n == 0 || err != nil {
+ if err == io.EOF {
+ err = nil
+ }
+ return content.String(), err
+ }
+ content.Write(buf[:n])
+ }
+}
+
+type fsType struct {
+ rootFn RootDentryFn
+}
+
+type filesystem struct {
+ kernfs.Filesystem
+}
+
+type file struct {
+ kernfs.DynamicBytesFile
+ content string
+}
+
+func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry {
+ f := &file{}
+ f.content = content
+ f.DynamicBytesFile.Init(creds, fs.NextIno(), f, 0777)
+
+ d := &kernfs.Dentry{}
+ d.Init(f)
+ return d
+}
+
+func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ fmt.Fprintf(buf, "%s", f.content)
+ return nil
+}
+
+type attrs struct {
+ kernfs.InodeAttrs
+}
+
+func (a *attrs) SetStat(fs *vfs.Filesystem, opt vfs.SetStatOptions) error {
+ return syserror.EPERM
+}
+
+type readonlyDir struct {
+ attrs
+ kernfs.InodeNotSymlink
+ kernfs.InodeNoDynamicLookup
+ kernfs.InodeDirectoryNoNewChildren
+
+ kernfs.OrderedChildren
+ dentry kernfs.Dentry
+}
+
+func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+ dir := &readonlyDir{}
+ dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode)
+ dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ dir.dentry.Init(dir)
+
+ dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
+
+ return &dir.dentry
+}
+
+func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+ fd := &kernfs.GenericDirectoryFD{}
+ fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+ return fd.VFSFileDescription(), nil
+}
+
+type dir struct {
+ attrs
+ kernfs.InodeNotSymlink
+ kernfs.InodeNoDynamicLookup
+
+ fs *filesystem
+ dentry kernfs.Dentry
+ kernfs.OrderedChildren
+}
+
+func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+ dir := &dir{}
+ dir.fs = fs
+ dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode)
+ dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
+ dir.dentry.Init(dir)
+
+ dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
+
+ return &dir.dentry
+}
+
+func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+ fd := &kernfs.GenericDirectoryFD{}
+ fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags)
+ return fd.VFSFileDescription(), nil
+}
+
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+ creds := auth.CredentialsFromContext(ctx)
+ dir := d.fs.newDir(creds, opts.Mode, nil)
+ dirVFSD := dir.VFSDentry()
+ if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil {
+ dir.DecRef()
+ return nil, err
+ }
+ d.IncLinks(1)
+ return dirVFSD, nil
+}
+
+func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+ creds := auth.CredentialsFromContext(ctx)
+ f := d.fs.newFile(creds, "")
+ fVFSD := f.VFSDentry()
+ if err := d.OrderedChildren.Insert(name, fVFSD); err != nil {
+ f.DecRef()
+ return nil, err
+ }
+ return fVFSD, nil
+}
+
+func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+func (fst *fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ fs := &filesystem{}
+ fs.Init(vfsObj)
+ root := fst.rootFn(creds, fs)
+ return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+// -------------------- Remainder of the file contains test cases --------------------
+
+func TestBasic(t *testing.T) {
+ sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+ return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+ "file1": fs.newFile(creds, staticFileContent),
+ })
+ })
+ sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef()
+}
+
+func TestMkdirGetDentry(t *testing.T) {
+ sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+ return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+ "dir1": fs.newDir(creds, 0755, nil),
+ })
+ })
+
+ pop := sys.PathOpAtRoot("dir1/a new directory")
+ if err := sys.vfs.MkdirAt(sys.ctx, sys.creds, &pop, &vfs.MkdirOptions{Mode: 0755}); err != nil {
+ t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err)
+ }
+ sys.GetDentryOrDie(pop).DecRef()
+}
+
+func TestReadStaticFile(t *testing.T) {
+ sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+ return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+ "file1": fs.newFile(creds, staticFileContent),
+ })
+ })
+
+ pop := sys.PathOpAtRoot("file1")
+ fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+ if err != nil {
+ sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+ }
+ defer fd.DecRef()
+
+ content, err := sys.ReadToEnd(fd)
+ if err != nil {
+ sys.t.Fatalf("Read failed: %v", err)
+ }
+ if diff := cmp.Diff(staticFileContent, content); diff != "" {
+ sys.t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff)
+ }
+}
+
+func TestCreateNewFileInStaticDir(t *testing.T) {
+ sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+ return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+ "dir1": fs.newDir(creds, 0755, nil),
+ })
+ })
+
+ pop := sys.PathOpAtRoot("dir1/newfile")
+ opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode}
+ fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, opts)
+ if err != nil {
+ sys.t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err)
+ }
+
+ // Close the file. The file should persist.
+ fd.DecRef()
+
+ fd, err = sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+ if err != nil {
+ sys.t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
+ }
+ fd.DecRef()
+}
+
+// direntCollector provides an implementation of vfs.IterDirentsCallback for
+// testing. It collects every dirent emitted while iterating to the end of a
+// given directory FD.
+type direntCollector struct {
+ mu sync.Mutex
+ dirents map[string]vfs.Dirent
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (d *direntCollector) Handle(dirent vfs.Dirent) bool {
+ d.mu.Lock()
+ if d.dirents == nil {
+ d.dirents = make(map[string]vfs.Dirent)
+ }
+ d.dirents[dirent.Name] = dirent
+ d.mu.Unlock()
+ return true
+}
+
+// count returns the number of dirents currently in the collector.
+func (d *direntCollector) count() int {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+ return len(d.dirents)
+}
+
+// contains checks whether the collector has a dirent with the given name and
+// type.
+func (d *direntCollector) contains(name string, typ uint8) error {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+ dirent, ok := d.dirents[name]
+ if !ok {
+ return fmt.Errorf("No dirent named %q found", name)
+ }
+ if dirent.Type != typ {
+ return fmt.Errorf("Dirent named %q found, but was expecting type %d, got: %+v", name, typ, dirent)
+ }
+ return nil
+}
+
+func TestDirFDReadWrite(t *testing.T) {
+ sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+ return fs.newReadonlyDir(creds, 0755, nil)
+ })
+
+ pop := sys.PathOpAtRoot("/")
+ fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+ if err != nil {
+ sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+ }
+ defer fd.DecRef()
+
+ // Read/Write should fail for directory FDs.
+ if _, err := fd.Read(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR {
+ sys.t.Fatalf("Read for directory FD failed with unexpected error: %v", err)
+ }
+ if _, err := fd.Write(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR {
+ sys.t.Fatalf("Wrire for directory FD failed with unexpected error: %v", err)
+ }
+}
+
+func TestDirFDIterDirents(t *testing.T) {
+ sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+ return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+ // Fill root with nodes backed by various inode implementations.
+ "dir1": fs.newReadonlyDir(creds, 0755, nil),
+ "dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{
+ "dir3": fs.newDir(creds, 0755, nil),
+ }),
+ "file1": fs.newFile(creds, staticFileContent),
+ })
+ })
+
+ pop := sys.PathOpAtRoot("/")
+ fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{})
+ if err != nil {
+ sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+ }
+ defer fd.DecRef()
+
+ collector := &direntCollector{}
+ if err := fd.IterDirents(sys.ctx, collector); err != nil {
+ sys.t.Fatalf("IterDirent failed: %v", err)
+ }
+
+ // Root directory should contain ".", ".." and 3 children:
+ if collector.count() != 5 {
+ sys.t.Fatalf("IterDirent returned too many dirents")
+ }
+ for _, dirName := range []string{".", "..", "dir1", "dir2"} {
+ if err := collector.contains(dirName, linux.DT_DIR); err != nil {
+ sys.t.Fatalf("IterDirent had unexpected results: %v", err)
+ }
+ }
+ if err := collector.contains("file1", linux.DT_REG); err != nil {
+ sys.t.Fatalf("IterDirent had unexpected results: %v", err)
+ }
+
+}
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
new file mode 100644
index 000000000..068063f4e
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+type staticSymlink struct {
+ InodeAttrs
+ InodeNoopRefCount
+ InodeSymlink
+
+ target string
+}
+
+var _ Inode = (*staticSymlink)(nil)
+
+// NewStaticSymlink creates a new symlink file pointing to 'target'.
+func NewStaticSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, target string) *Dentry {
+ inode := &staticSymlink{target: target}
+ inode.Init(creds, ino, linux.ModeSymlink|perm)
+
+ d := &Dentry{}
+ d.Init(inode)
+ return d
+}
+
+func (s *staticSymlink) Readlink(_ context.Context) (string, error) {
+ return s.target, nil
+}
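
As a usage note (not part of this change), a static symlink can simply be hashed into a directory's contents like any other child. The snippet below reuses helpers from kernfs_test.go above; the entry name and link target are made up.

```go
// A sketch: mix a static symlink into a read-only directory's children.
contents := map[string]*kernfs.Dentry{
	"self":  kernfs.NewStaticSymlink(creds, fs.NextIno(), 0777, "123"),
	"file1": fs.newFile(creds, staticFileContent),
}
root := fs.newReadonlyDir(creds, 0755, contents)
_ = root
```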
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
deleted file mode 100644
index 08a9cb8ef..000000000
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ /dev/null
@@ -1,584 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
- "fmt"
- "sync/atomic"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// stepLocked resolves rp.Component() in parent directory vfsd.
-//
-// stepLocked is loosely analogous to fs/namei.c:walk_component().
-//
-// Preconditions: filesystem.mu must be locked. !rp.Done(). inode ==
-// vfsd.Impl().(*dentry).inode.
-func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode) (*vfs.Dentry, *inode, error) {
- if !inode.isDir() {
- return nil, nil, syserror.ENOTDIR
- }
- if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
- return nil, nil, err
- }
-afterSymlink:
- nextVFSD, err := rp.ResolveComponent(vfsd)
- if err != nil {
- return nil, nil, err
- }
- if nextVFSD == nil {
- // Since the Dentry tree is the sole source of truth for memfs, if it's
- // not in the Dentry tree, it doesn't exist.
- return nil, nil, syserror.ENOENT
- }
- nextInode := nextVFSD.Impl().(*dentry).inode
- if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
- // TODO: symlink traversals update access time
- if err := rp.HandleSymlink(symlink.target); err != nil {
- return nil, nil, err
- }
- goto afterSymlink // don't check the current directory again
- }
- rp.Advance()
- return nextVFSD, nextInode, nil
-}
-
-// walkExistingLocked resolves rp to an existing file.
-//
-// walkExistingLocked is loosely analogous to Linux's
-// fs/namei.c:path_lookupat().
-//
-// Preconditions: filesystem.mu must be locked.
-func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
- vfsd := rp.Start()
- inode := vfsd.Impl().(*dentry).inode
- for !rp.Done() {
- var err error
- vfsd, inode, err = stepLocked(rp, vfsd, inode)
- if err != nil {
- return nil, nil, err
- }
- }
- if rp.MustBeDir() && !inode.isDir() {
- return nil, nil, syserror.ENOTDIR
- }
- return vfsd, inode, nil
-}
-
-// walkParentDirLocked resolves all but the last path component of rp to an
-// existing directory. It does not check that the returned directory is
-// searchable by the provider of rp.
-//
-// walkParentDirLocked is loosely analogous to Linux's
-// fs/namei.c:path_parentat().
-//
-// Preconditions: filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
- vfsd := rp.Start()
- inode := vfsd.Impl().(*dentry).inode
- for !rp.Final() {
- var err error
- vfsd, inode, err = stepLocked(rp, vfsd, inode)
- if err != nil {
- return nil, nil, err
- }
- }
- if !inode.isDir() {
- return nil, nil, syserror.ENOTDIR
- }
- return vfsd, inode, nil
-}
-
-// checkCreateLocked checks that a file named rp.Component() may be created in
-// directory parentVFSD, then returns rp.Component().
-//
-// Preconditions: filesystem.mu must be locked. parentInode ==
-// parentVFSD.Impl().(*dentry).inode. parentInode.isDir() == true.
-func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *inode) (string, error) {
- if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
- return "", err
- }
- pc := rp.Component()
- if pc == "." || pc == ".." {
- return "", syserror.EEXIST
- }
- childVFSD, err := rp.ResolveChild(parentVFSD, pc)
- if err != nil {
- return "", err
- }
- if childVFSD != nil {
- return "", syserror.EEXIST
- }
- if parentVFSD.IsDisowned() {
- return "", syserror.ENOENT
- }
- return pc, nil
-}
-
-// checkDeleteLocked checks that the file represented by vfsd may be deleted.
-func checkDeleteLocked(vfsd *vfs.Dentry) error {
- parentVFSD := vfsd.Parent()
- if parentVFSD == nil {
- return syserror.EBUSY
- }
- if parentVFSD.IsDisowned() {
- return syserror.ENOENT
- }
- return nil
-}
-
-// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
-func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
- fs.mu.RLock()
- defer fs.mu.RUnlock()
- vfsd, inode, err := walkExistingLocked(rp)
- if err != nil {
- return nil, err
- }
- if opts.CheckSearchable {
- if !inode.isDir() {
- return nil, syserror.ENOTDIR
- }
- if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
- return nil, err
- }
- }
- inode.incRef()
- return vfsd, nil
-}
-
-// LinkAt implements vfs.FilesystemImpl.LinkAt.
-func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
- if rp.Done() {
- return syserror.EEXIST
- }
- fs.mu.Lock()
- defer fs.mu.Unlock()
- parentVFSD, parentInode, err := walkParentDirLocked(rp)
- if err != nil {
- return err
- }
- pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
- if err != nil {
- return err
- }
- if rp.Mount() != vd.Mount() {
- return syserror.EXDEV
- }
- if err := rp.Mount().CheckBeginWrite(); err != nil {
- return err
- }
- defer rp.Mount().EndWrite()
- d := vd.Dentry().Impl().(*dentry)
- if d.inode.isDir() {
- return syserror.EPERM
- }
- d.inode.incLinksLocked()
- child := fs.newDentry(d.inode)
- parentVFSD.InsertChild(&child.vfsd, pc)
- parentInode.impl.(*directory).childList.PushBack(child)
- return nil
-}
-
-// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
-func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
- if rp.Done() {
- return syserror.EEXIST
- }
- fs.mu.Lock()
- defer fs.mu.Unlock()
- parentVFSD, parentInode, err := walkParentDirLocked(rp)
- if err != nil {
- return err
- }
- pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
- if err != nil {
- return err
- }
- if err := rp.Mount().CheckBeginWrite(); err != nil {
- return err
- }
- defer rp.Mount().EndWrite()
- child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
- parentVFSD.InsertChild(&child.vfsd, pc)
- parentInode.impl.(*directory).childList.PushBack(child)
- parentInode.incLinksLocked() // from child's ".."
- return nil
-}
-
-// MknodAt implements vfs.FilesystemImpl.MknodAt.
-func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
- if rp.Done() {
- return syserror.EEXIST
- }
- fs.mu.Lock()
- defer fs.mu.Unlock()
- parentVFSD, parentInode, err := walkParentDirLocked(rp)
- if err != nil {
- return err
- }
- pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
- if err != nil {
- return err
- }
- if err := rp.Mount().CheckBeginWrite(); err != nil {
- return err
- }
- defer rp.Mount().EndWrite()
-
- switch opts.Mode.FileType() {
- case 0:
- // "Zero file type is equivalent to type S_IFREG." - mknod(2)
- fallthrough
- case linux.ModeRegular:
- // TODO(b/138862511): Implement.
- return syserror.EINVAL
-
- case linux.ModeNamedPipe:
- child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
- parentVFSD.InsertChild(&child.vfsd, pc)
- parentInode.impl.(*directory).childList.PushBack(child)
- return nil
-
- case linux.ModeSocket:
- // TODO(b/138862511): Implement.
- return syserror.EINVAL
-
- case linux.ModeCharacterDevice:
- fallthrough
- case linux.ModeBlockDevice:
- // TODO(b/72101894): We don't support creating block or character
- // devices at the moment.
- //
- // When we start supporting block and character devices, we'll
- // need to check for CAP_MKNOD here.
- return syserror.EPERM
-
- default:
- // "EINVAL - mode requested creation of something other than a
- // regular file, device special file, FIFO or socket." - mknod(2)
- return syserror.EINVAL
- }
-}
-
-// OpenAt implements vfs.FilesystemImpl.OpenAt.
-func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
- // Filter out flags that are not supported by memfs. O_DIRECTORY and
- // O_NOFOLLOW have no effect here (they're handled by VFS by setting
- // appropriate bits in rp), but are returned by
- // FileDescriptionImpl.StatusFlags(). O_NONBLOCK is supported only by
- // pipes.
- opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK
-
- if opts.Flags&linux.O_CREAT == 0 {
- fs.mu.RLock()
- defer fs.mu.RUnlock()
- vfsd, inode, err := walkExistingLocked(rp)
- if err != nil {
- return nil, err
- }
- return inode.open(ctx, rp, vfsd, opts.Flags, false)
- }
-
- mustCreate := opts.Flags&linux.O_EXCL != 0
- vfsd := rp.Start()
- inode := vfsd.Impl().(*dentry).inode
- fs.mu.Lock()
- defer fs.mu.Unlock()
- if rp.Done() {
- if rp.MustBeDir() {
- return nil, syserror.EISDIR
- }
- if mustCreate {
- return nil, syserror.EEXIST
- }
- return inode.open(ctx, rp, vfsd, opts.Flags, false)
- }
-afterTrailingSymlink:
- // Walk to the parent directory of the last path component.
- for !rp.Final() {
- var err error
- vfsd, inode, err = stepLocked(rp, vfsd, inode)
- if err != nil {
- return nil, err
- }
- }
- if !inode.isDir() {
- return nil, syserror.ENOTDIR
- }
- // Check for search permission in the parent directory.
- if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
- return nil, err
- }
- // Reject attempts to open directories with O_CREAT.
- if rp.MustBeDir() {
- return nil, syserror.EISDIR
- }
- pc := rp.Component()
- if pc == "." || pc == ".." {
- return nil, syserror.EISDIR
- }
- // Determine whether or not we need to create a file.
- childVFSD, err := rp.ResolveChild(vfsd, pc)
- if err != nil {
- return nil, err
- }
- if childVFSD == nil {
- // Already checked for searchability above; now check for writability.
- if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
- return nil, err
- }
- if err := rp.Mount().CheckBeginWrite(); err != nil {
- return nil, err
- }
- defer rp.Mount().EndWrite()
- // Create and open the child.
- childInode := fs.newRegularFile(rp.Credentials(), opts.Mode)
- child := fs.newDentry(childInode)
- vfsd.InsertChild(&child.vfsd, pc)
- inode.impl.(*directory).childList.PushBack(child)
- return childInode.open(ctx, rp, &child.vfsd, opts.Flags, true)
- }
- // Open existing file or follow symlink.
- if mustCreate {
- return nil, syserror.EEXIST
- }
- childInode := childVFSD.Impl().(*dentry).inode
- if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
- // TODO: symlink traversals update access time
- if err := rp.HandleSymlink(symlink.target); err != nil {
- return nil, err
- }
- // rp.Final() may no longer be true since we now need to resolve the
- // symlink target.
- goto afterTrailingSymlink
- }
- return childInode.open(ctx, rp, childVFSD, opts.Flags, false)
-}
-
-func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
- ats := vfs.AccessTypesForOpenFlags(flags)
- if !afterCreate {
- if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil {
- return nil, err
- }
- }
- mnt := rp.Mount()
- switch impl := i.impl.(type) {
- case *regularFile:
- var fd regularFileFD
- fd.flags = flags
- fd.readable = vfs.MayReadFileWithOpenFlags(flags)
- fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
- if fd.writable {
- if err := mnt.CheckBeginWrite(); err != nil {
- return nil, err
- }
- // mnt.EndWrite() is called by regularFileFD.Release().
- }
- mnt.IncRef()
- vfsd.IncRef()
- fd.vfsfd.Init(&fd, mnt, vfsd)
- if flags&linux.O_TRUNC != 0 {
- impl.mu.Lock()
- impl.data = impl.data[:0]
- atomic.StoreInt64(&impl.dataLen, 0)
- impl.mu.Unlock()
- }
- return &fd.vfsfd, nil
- case *directory:
- // Can't open directories writably.
- if ats&vfs.MayWrite != 0 {
- return nil, syserror.EISDIR
- }
- var fd directoryFD
- mnt.IncRef()
- vfsd.IncRef()
- fd.vfsfd.Init(&fd, mnt, vfsd)
- fd.flags = flags
- return &fd.vfsfd, nil
- case *symlink:
- // Can't open symlinks without O_PATH (which is unimplemented).
- return nil, syserror.ELOOP
- case *namedPipe:
- return newNamedPipeFD(ctx, impl, rp, vfsd, flags)
- default:
- panic(fmt.Sprintf("unknown inode type: %T", i.impl))
- }
-}
-
-// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
-func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
- fs.mu.RLock()
- _, inode, err := walkExistingLocked(rp)
- fs.mu.RUnlock()
- if err != nil {
- return "", err
- }
- symlink, ok := inode.impl.(*symlink)
- if !ok {
- return "", syserror.EINVAL
- }
- return symlink.target, nil
-}
-
-// RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
- if rp.Done() {
- return syserror.ENOENT
- }
- fs.mu.Lock()
- defer fs.mu.Unlock()
- parentVFSD, parentInode, err := walkParentDirLocked(rp)
- if err != nil {
- return err
- }
- _, err = checkCreateLocked(rp, parentVFSD, parentInode)
- if err != nil {
- return err
- }
- if err := rp.Mount().CheckBeginWrite(); err != nil {
- return err
- }
- defer rp.Mount().EndWrite()
- // TODO: actually implement RenameAt
- return syserror.EPERM
-}
-
-// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
-func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
- fs.mu.Lock()
- defer fs.mu.Unlock()
- vfsd, inode, err := walkExistingLocked(rp)
- if err != nil {
- return err
- }
- if err := rp.Mount().CheckBeginWrite(); err != nil {
- return err
- }
- defer rp.Mount().EndWrite()
- if err := checkDeleteLocked(vfsd); err != nil {
- return err
- }
- if !inode.isDir() {
- return syserror.ENOTDIR
- }
- if vfsd.HasChildren() {
- return syserror.ENOTEMPTY
- }
- if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
- return err
- }
- // Remove from parent directory's childList.
- vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
- inode.decRef()
- return nil
-}
-
-// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
-func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
- fs.mu.RLock()
- _, _, err := walkExistingLocked(rp)
- fs.mu.RUnlock()
- if err != nil {
- return err
- }
- if opts.Stat.Mask == 0 {
- return nil
- }
- // TODO: implement inode.setStat
- return syserror.EPERM
-}
-
-// StatAt implements vfs.FilesystemImpl.StatAt.
-func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
- fs.mu.RLock()
- _, inode, err := walkExistingLocked(rp)
- fs.mu.RUnlock()
- if err != nil {
- return linux.Statx{}, err
- }
- var stat linux.Statx
- inode.statTo(&stat)
- return stat, nil
-}
-
-// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
-func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
- fs.mu.RLock()
- _, _, err := walkExistingLocked(rp)
- fs.mu.RUnlock()
- if err != nil {
- return linux.Statfs{}, err
- }
- // TODO: actually implement statfs
- return linux.Statfs{}, syserror.ENOSYS
-}
-
-// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
-func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
- if rp.Done() {
- return syserror.EEXIST
- }
- fs.mu.Lock()
- defer fs.mu.Unlock()
- parentVFSD, parentInode, err := walkParentDirLocked(rp)
- if err != nil {
- return err
- }
- pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
- if err != nil {
- return err
- }
- if err := rp.Mount().CheckBeginWrite(); err != nil {
- return err
- }
- defer rp.Mount().EndWrite()
- child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
- parentVFSD.InsertChild(&child.vfsd, pc)
- parentInode.impl.(*directory).childList.PushBack(child)
- return nil
-}
-
-// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
-func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
- fs.mu.Lock()
- defer fs.mu.Unlock()
- vfsd, inode, err := walkExistingLocked(rp)
- if err != nil {
- return err
- }
- if err := rp.Mount().CheckBeginWrite(); err != nil {
- return err
- }
- defer rp.Mount().EndWrite()
- if err := checkDeleteLocked(vfsd); err != nil {
- return err
- }
- if inode.isDir() {
- return syserror.EISDIR
- }
- if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
- return err
- }
- // Remove from parent directory's childList.
- vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
- inode.decLinksLocked()
- return nil
-}
diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go
deleted file mode 100644
index b7f4853b3..000000000
--- a/pkg/sentry/fsimpl/memfs/regular_file.go
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
- "io"
- "sync"
- "sync/atomic"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-type regularFile struct {
- inode inode
-
- mu sync.RWMutex
- data []byte
- // dataLen is len(data), but accessed using atomic memory operations to
- // avoid locking in inode.stat().
- dataLen int64
-}
-
-func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
- file := &regularFile{}
- file.inode.init(file, fs, creds, mode)
- file.inode.nlink = 1 // from parent directory
- return &file.inode
-}
-
-type regularFileFD struct {
- fileDescription
-
- // These are immutable.
- readable bool
- writable bool
-
- // off is the file offset. off is accessed using atomic memory operations.
- // offMu serializes operations that may mutate off.
- off int64
- offMu sync.Mutex
-}
-
-// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *regularFileFD) Release() {
- if fd.writable {
- fd.vfsfd.VirtualDentry().Mount().EndWrite()
- }
-}
-
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
- if !fd.readable {
- return 0, syserror.EINVAL
- }
- f := fd.inode().impl.(*regularFile)
- f.mu.RLock()
- if offset >= int64(len(f.data)) {
- f.mu.RUnlock()
- return 0, io.EOF
- }
- n, err := dst.CopyOut(ctx, f.data[offset:])
- f.mu.RUnlock()
- return int64(n), err
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
- fd.offMu.Lock()
- n, err := fd.PRead(ctx, dst, fd.off, opts)
- fd.off += n
- fd.offMu.Unlock()
- return n, err
-}
-
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
- if !fd.writable {
- return 0, syserror.EINVAL
- }
- if offset < 0 {
- return 0, syserror.EINVAL
- }
- srclen := src.NumBytes()
- if srclen == 0 {
- return 0, nil
- }
- f := fd.inode().impl.(*regularFile)
- f.mu.Lock()
- end := offset + srclen
- if end < offset {
- // Overflow.
- f.mu.Unlock()
- return 0, syserror.EFBIG
- }
- if end > f.dataLen {
- f.data = append(f.data, make([]byte, end-f.dataLen)...)
- atomic.StoreInt64(&f.dataLen, end)
- }
- n, err := src.CopyIn(ctx, f.data[offset:end])
- f.mu.Unlock()
- return int64(n), err
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
- fd.offMu.Lock()
- n, err := fd.PWrite(ctx, src, fd.off, opts)
- fd.off += n
- fd.offMu.Unlock()
- return n, err
-}
-
-// Seek implements vfs.FileDescriptionImpl.Seek.
-func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
- fd.offMu.Lock()
- defer fd.offMu.Unlock()
- switch whence {
- case linux.SEEK_SET:
- // use offset as specified
- case linux.SEEK_CUR:
- offset += fd.off
- case linux.SEEK_END:
- offset += atomic.LoadInt64(&fd.inode().impl.(*regularFile).dataLen)
- default:
- return 0, syserror.EINVAL
- }
- if offset < 0 {
- return 0, syserror.EINVAL
- }
- fd.off = offset
- return offset, nil
-}
-
-// Sync implements vfs.FileDescriptionImpl.Sync.
-func (fd *regularFileFD) Sync(ctx context.Context) error {
- return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index ade6ac946..1f44b3217 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -6,15 +6,17 @@ package(licenses = ["notice"])
go_library(
name = "proc",
srcs = [
- "filesystems.go",
+ "filesystem.go",
"loadavg.go",
"meminfo.go",
"mounts.go",
"net.go",
- "proc.go",
"stat.go",
"sys.go",
"task.go",
+ "task_files.go",
+ "tasks.go",
+ "tasks_files.go",
"version.go",
],
importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc",
@@ -24,8 +26,10 @@ go_library(
"//pkg/log",
"//pkg/sentry/context",
"//pkg/sentry/fs",
+ "//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
"//pkg/sentry/limits",
"//pkg/sentry/mm",
"//pkg/sentry/socket",
@@ -34,17 +38,40 @@ go_library(
"//pkg/sentry/usage",
"//pkg/sentry/usermem",
"//pkg/sentry/vfs",
+ "//pkg/syserror",
],
)
go_test(
name = "proc_test",
size = "small",
- srcs = ["net_test.go"],
+ srcs = [
+ "boot_test.go",
+ "net_test.go",
+ "tasks_test.go",
+ ],
embed = [":proc"],
deps = [
"//pkg/abi/linux",
+ "//pkg/cpuid",
+ "//pkg/fspath",
+ "//pkg/memutil",
+ "//pkg/sentry/context",
"//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fs",
"//pkg/sentry/inet",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/sched",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/loader",
+ "//pkg/sentry/pgalloc",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/platform/kvm",
+ "//pkg/sentry/platform/ptrace",
+ "//pkg/sentry/time",
+ "//pkg/sentry/usermem",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
],
)
diff --git a/pkg/sentry/fsimpl/proc/boot_test.go b/pkg/sentry/fsimpl/proc/boot_test.go
new file mode 100644
index 000000000..84a93ee56
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/boot_test.go
@@ -0,0 +1,149 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "flag"
+ "fmt"
+ "os"
+ "runtime"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/cpuid"
+ "gvisor.dev/gvisor/pkg/memutil"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/sched"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/loader"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/time"
+
+ // Platforms are pluggable.
+ _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
+ _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
+)
+
+var (
+ platformFlag = flag.String("platform", "ptrace", "specify which platform to use")
+)
+
+// boot initializes a new bare-bones kernel for testing.
+func boot() (*kernel.Kernel, error) {
+ platformCtr, err := platform.Lookup(*platformFlag)
+ if err != nil {
+ return nil, fmt.Errorf("platform not found: %v", err)
+ }
+ deviceFile, err := platformCtr.OpenDevice()
+ if err != nil {
+ return nil, fmt.Errorf("creating platform: %v", err)
+ }
+ plat, err := platformCtr.New(deviceFile)
+ if err != nil {
+ return nil, fmt.Errorf("creating platform: %v", err)
+ }
+
+ k := &kernel.Kernel{
+ Platform: plat,
+ }
+
+ mf, err := createMemoryFile()
+ if err != nil {
+ return nil, err
+ }
+ k.SetMemoryFile(mf)
+
+ // Pass k as the platform since it is savable, unlike the actual platform.
+ vdso, err := loader.PrepareVDSO(nil, k)
+ if err != nil {
+ return nil, fmt.Errorf("creating vdso: %v", err)
+ }
+
+ // Create timekeeper.
+ tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
+ if err != nil {
+ return nil, fmt.Errorf("creating timekeeper: %v", err)
+ }
+ tk.SetClocks(time.NewCalibratedClocks())
+
+ creds := auth.NewRootCredentials(auth.NewRootUserNamespace())
+
+ // Initialize the Kernel object, which is required by the Context passed
+ // to createVFS in order to mount (among other things) procfs.
+ if err = k.Init(kernel.InitKernelArgs{
+ ApplicationCores: uint(runtime.GOMAXPROCS(-1)),
+ FeatureSet: cpuid.HostFeatureSet(),
+ Timekeeper: tk,
+ RootUserNamespace: creds.UserNamespace,
+ Vdso: vdso,
+ RootUTSNamespace: kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace),
+ RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace),
+ RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+ PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace),
+ }); err != nil {
+ return nil, fmt.Errorf("initializing kernel: %v", err)
+ }
+
+ ctx := k.SupervisorContext()
+
+ // Create mount namespace without root as it's the minimum required to create
+ // the global thread group.
+ mntns, err := fs.NewMountNamespace(ctx, nil)
+ if err != nil {
+ return nil, err
+ }
+ ls, err := limits.NewLinuxLimitSet()
+ if err != nil {
+ return nil, err
+ }
+ tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
+ k.TestOnly_SetGlobalInit(tg)
+
+ return k, nil
+}
+
+// createTask creates a new bare-bones task for tests.
+func createTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) {
+ k := kernel.KernelFromContext(ctx)
+ config := &kernel.TaskConfig{
+ Kernel: k,
+ ThreadGroup: tc,
+ TaskContext: &kernel.TaskContext{Name: name},
+ Credentials: auth.CredentialsFromContext(ctx),
+ AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()),
+ UTSNamespace: kernel.UTSNamespaceFromContext(ctx),
+ IPCNamespace: kernel.IPCNamespaceFromContext(ctx),
+ AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+ }
+ return k.TaskSet().NewTask(config)
+}
+
+func createMemoryFile() (*pgalloc.MemoryFile, error) {
+ const memfileName = "test-memory"
+ memfd, err := memutil.CreateMemFD(memfileName, 0)
+ if err != nil {
+ return nil, fmt.Errorf("error creating memfd: %v", err)
+ }
+ memfile := os.NewFile(uintptr(memfd), memfileName)
+ mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
+ if err != nil {
+ memfile.Close()
+ return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
+ }
+ return mf, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
new file mode 100644
index 000000000..d09182c77
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -0,0 +1,69 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package proc implements a partial in-memory file system for procfs.
+package proc
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// procFSType is the factory class for procfs.
+//
+// +stateify savable
+type procFSType struct{}
+
+var _ vfs.FilesystemType = (*procFSType)(nil)
+
+// GetFilesystem implements vfs.FilesystemType.
+func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ k := kernel.KernelFromContext(ctx)
+ if k == nil {
+ return nil, nil, fmt.Errorf("procfs requires a kernel")
+ }
+ pidns := kernel.PIDNamespaceFromContext(ctx)
+ if pidns == nil {
+ return nil, nil, fmt.Errorf("procfs requires a PID namespace")
+ }
+
+ procfs := &kernfs.Filesystem{}
+ procfs.VFSFilesystem().Init(vfsObj, procfs)
+
+ _, dentry := newTasksInode(procfs, k, pidns)
+ return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
+}
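
Mount-time registration is not part of this change; below is a rough sketch of how the type would be registered with a VirtualFilesystem, where the "proc" source name and the options are assumptions.

```go
// Illustrative only: make procfs mountable under the name "proc".
vfsObj.MustRegisterFilesystemType("proc", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{
	AllowUserMount: true,
})
```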
+
+// dynamicInode is an overfitted interface for common Inodes with
+// vfs.DynamicBytesSource types used in procfs.
+type dynamicInode interface {
+ kernfs.Inode
+ vfs.DynamicBytesSource
+
+ Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
+}
+
+func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+ inode.Init(creds, ino, inode, perm)
+
+ d := &kernfs.Dentry{}
+ d.Init(inode)
+ return d
+}
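
For context, newDentry is how later parts of this change wire dynamic files into the procfs tree: any type embedding kernfs.DynamicBytesFile (and therefore satisfying dynamicInode) is passed in twice, once as the inode and once as its own data source. A hedged sketch, where creds, inoGen, k and the 0444 permission are caller-side assumptions:

```go
// Illustrative wiring of two dynamic files (see loadavg.go and meminfo.go
// below for the inode types themselves).
loadavg := newDentry(creds, inoGen.NextIno(), 0444, &loadavgData{})
meminfo := newDentry(creds, inoGen.NextIno(), 0444, &meminfoData{k: k})
_, _ = loadavg, meminfo
```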
diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go
index 9135afef1..5351d86e8 100644
--- a/pkg/sentry/fsimpl/proc/loadavg.go
+++ b/pkg/sentry/fsimpl/proc/loadavg.go
@@ -19,15 +19,17 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
)
// loadavgData backs /proc/loadavg.
//
// +stateify savable
-type loadavgData struct{}
+type loadavgData struct {
+ kernfs.DynamicBytesFile
+}
-var _ vfs.DynamicBytesSource = (*loadavgData)(nil)
+var _ dynamicInode = (*loadavgData)(nil)
// Generate implements vfs.DynamicBytesSource.Generate.
func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go
index 9a827cd66..cbdd4f3fc 100644
--- a/pkg/sentry/fsimpl/proc/meminfo.go
+++ b/pkg/sentry/fsimpl/proc/meminfo.go
@@ -19,21 +19,23 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/usermem"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
)
// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo.
//
// +stateify savable
type meminfoData struct {
+ kernfs.DynamicBytesFile
+
// k is the owning Kernel.
k *kernel.Kernel
}
-var _ vfs.DynamicBytesSource = (*meminfoData)(nil)
+var _ dynamicInode = (*meminfoData)(nil)
// Generate implements vfs.DynamicBytesSource.Generate.
func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
diff --git a/pkg/sentry/fsimpl/proc/mounts.go b/pkg/sentry/fsimpl/proc/mounts.go
index e81b1e910..8683cf677 100644
--- a/pkg/sentry/fsimpl/proc/mounts.go
+++ b/pkg/sentry/fsimpl/proc/mounts.go
@@ -16,7 +16,7 @@ package proc
import "gvisor.dev/gvisor/pkg/sentry/kernel"
-// TODO(b/138862512): Implement mountInfoFile and mountsFile.
+// TODO(gvisor.dev/issue/1195): Implement mountInfoFile and mountsFile.
// mountInfoFile implements vfs.DynamicBytesSource for /proc/[pid]/mountinfo.
//
diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go
index 720db3828..50894a534 100644
--- a/pkg/sentry/fsimpl/proc/stat.go
+++ b/pkg/sentry/fsimpl/proc/stat.go
@@ -20,8 +20,8 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
)
// cpuStats contains the breakdown of CPU time for /proc/stat.
@@ -66,11 +66,13 @@ func (c cpuStats) String() string {
//
// +stateify savable
type statData struct {
+ kernfs.DynamicBytesFile
+
// k is the owning Kernel.
k *kernel.Kernel
}
-var _ vfs.DynamicBytesSource = (*statData)(nil)
+var _ dynamicInode = (*statData)(nil)
// Generate implements vfs.DynamicBytesSource.Generate.
func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index c46e05c3a..11a64c777 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -15,247 +15,176 @@
package proc
import (
- "bytes"
- "fmt"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/mm"
- "gvisor.dev/gvisor/pkg/sentry/usage"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
)
-// mapsCommon is embedded by mapsData and smapsData.
-type mapsCommon struct {
- t *kernel.Task
-}
-
-// mm gets the kernel task's MemoryManager. No additional reference is taken on
-// mm here. This is safe because MemoryManager.destroy is required to leave the
-// MemoryManager in a state where it's still usable as a DynamicBytesSource.
-func (md *mapsCommon) mm() *mm.MemoryManager {
- var tmm *mm.MemoryManager
- md.t.WithMuLocked(func(t *kernel.Task) {
- if mm := t.MemoryManager(); mm != nil {
- tmm = mm
- }
- })
- return tmm
-}
-
-// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
+// taskInode represents the inode for /proc/PID/ directory.
//
// +stateify savable
-type mapsData struct {
- mapsCommon
+type taskInode struct {
+ kernfs.InodeNotSymlink
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeNoDynamicLookup
+ kernfs.InodeAttrs
+ kernfs.OrderedChildren
+
+ task *kernel.Task
}
-var _ vfs.DynamicBytesSource = (*mapsData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (md *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
- if mm := md.mm(); mm != nil {
- mm.ReadMapsDataInto(ctx, buf)
+var _ kernfs.Inode = (*taskInode)(nil)
+
+func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool) *kernfs.Dentry {
+ contents := map[string]*kernfs.Dentry{
+ //"auxv": newAuxvec(t, msrc),
+ //"cmdline": newExecArgInode(t, msrc, cmdlineExecArg),
+ //"comm": newComm(t, msrc),
+ //"environ": newExecArgInode(t, msrc, environExecArg),
+ //"exe": newExe(t, msrc),
+ //"fd": newFdDir(t, msrc),
+ //"fdinfo": newFdInfoDir(t, msrc),
+ //"gid_map": newGIDMap(t, msrc),
+ "io": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, newIO(task, isThreadGroup)),
+ "maps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &mapsData{task: task}),
+ //"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc),
+ //"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc),
+ //"ns": newNamespaceDir(t, msrc),
+ "smaps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &smapsData{task: task}),
+ "stat": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}),
+ "statm": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statmData{t: task}),
+ "status": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statusData{t: task, pidns: pidns}),
+ //"uid_map": newUIDMap(t, msrc),
}
- return nil
-}
-
-// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
-//
-// +stateify savable
-type smapsData struct {
- mapsCommon
-}
-
-var _ vfs.DynamicBytesSource = (*smapsData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (sd *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
- if mm := sd.mm(); mm != nil {
- mm.ReadSmapsDataInto(ctx, buf)
+ if isThreadGroup {
+ //contents["task"] = p.newSubtasks(t, msrc)
}
- return nil
-}
-
-// +stateify savable
-type taskStatData struct {
- t *kernel.Task
+ //if len(p.cgroupControllers) > 0 {
+ // contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers)
+ //}
- // If tgstats is true, accumulate fault stats (not implemented) and CPU
- // time across all tasks in t's thread group.
- tgstats bool
+ taskInode := &taskInode{task: task}
+ // Note: credentials are overridden by taskOwnedInode.
+ taskInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555)
- // pidns is the PID namespace associated with the proc filesystem that
- // includes the file using this statData.
- pidns *kernel.PIDNamespace
-}
-
-var _ vfs.DynamicBytesSource = (*taskStatData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
- fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t))
- fmt.Fprintf(buf, "(%s) ", s.t.Name())
- fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0])
- ppid := kernel.ThreadID(0)
- if parent := s.t.Parent(); parent != nil {
- ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
- }
- fmt.Fprintf(buf, "%d ", ppid)
- fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup()))
- fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session()))
- fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
- fmt.Fprintf(buf, "0 " /* flags */)
- fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
- var cputime usage.CPUStats
- if s.tgstats {
- cputime = s.t.ThreadGroup().CPUStats()
- } else {
- cputime = s.t.CPUStats()
- }
- fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
- cputime = s.t.ThreadGroup().JoinedChildCPUStats()
- fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
- fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness())
- fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count())
+ inode := &taskOwnedInode{Inode: taskInode, owner: task}
+ dentry := &kernfs.Dentry{}
+ dentry.Init(inode)
- // itrealvalue. Since kernel 2.6.17, this field is no longer
- // maintained, and is hard coded as 0.
- fmt.Fprintf(buf, "0 ")
+ taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ links := taskInode.OrderedChildren.Populate(dentry, contents)
+ taskInode.IncLinks(links)
- // Start time is relative to boot time, expressed in clock ticks.
- fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime())))
+ return dentry
+}
- var vss, rss uint64
- s.t.WithMuLocked(func(t *kernel.Task) {
- if mm := t.MemoryManager(); mm != nil {
- vss = mm.VirtualMemorySize()
- rss = mm.ResidentSetSize()
- }
- })
- fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize)
+// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long
+// as the task is still running. When it's dead, another task with the same
+// PID could replace it.
+func (i *taskInode) Valid(ctx context.Context) bool {
+ return i.task.ExitState() != kernel.TaskExitDead
+}
- // rsslim.
- fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur)
+// Open implements kernfs.Inode.
+func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+ fd := &kernfs.GenericDirectoryFD{}
+ fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+ return fd.VFSFileDescription(), nil
+}
- fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
- fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
- fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
- terminationSignal := linux.Signal(0)
- if s.t == s.t.ThreadGroup().Leader() {
- terminationSignal = s.t.ThreadGroup().TerminationSignal()
+// SetStat implements kernfs.Inode.
+func (i *taskInode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error {
+ stat := opts.Stat
+ if stat.Mask&linux.STATX_MODE != 0 {
+ return syserror.EPERM
}
- fmt.Fprintf(buf, "%d ", terminationSignal)
- fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
- fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
- fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
- fmt.Fprintf(buf, "0\n" /* exit_code */)
-
return nil
}
-// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
-//
-// +stateify savable
-type statmData struct {
- t *kernel.Task
+// taskOwnedInode implements kernfs.Inode and overrides inode owner with task
+// effective user and group.
+type taskOwnedInode struct {
+ kernfs.Inode
+
+ // owner is the task that owns this inode.
+ owner *kernel.Task
}
-var _ vfs.DynamicBytesSource = (*statmData)(nil)
+var _ kernfs.Inode = (*taskOwnedInode)(nil)
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
- var vss, rss uint64
- s.t.WithMuLocked(func(t *kernel.Task) {
- if mm := t.MemoryManager(); mm != nil {
- vss = mm.VirtualMemorySize()
- rss = mm.ResidentSetSize()
- }
- })
+func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+ // Note: credentials are overridden by taskOwnedInode.
+ inode.Init(task.Credentials(), ino, inode, perm)
- fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize)
- return nil
+ taskInode := &taskOwnedInode{Inode: inode, owner: task}
+ d := &kernfs.Dentry{}
+ d.Init(taskInode)
+ return d
}
-// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status.
-//
-// +stateify savable
-type statusData struct {
- t *kernel.Task
- pidns *kernel.PIDNamespace
+// Stat implements kernfs.Inode.
+func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx {
+ stat := i.Inode.Stat(fs)
+ uid, gid := i.getOwner(linux.FileMode(stat.Mode))
+ stat.UID = uint32(uid)
+ stat.GID = uint32(gid)
+ return stat
}
-var _ vfs.DynamicBytesSource = (*statusData)(nil)
+// CheckPermissions implements kernfs.Inode.
+func (i *taskOwnedInode) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+ mode := i.Mode()
+ uid, gid := i.getOwner(mode)
+ return vfs.GenericCheckPermissions(
+ creds,
+ ats,
+ mode.FileType() == linux.ModeDirectory,
+ uint16(mode),
+ uid,
+ gid,
+ )
+}
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
- fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name())
- fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus())
- fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup()))
- fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t))
- ppid := kernel.ThreadID(0)
- if parent := s.t.Parent(); parent != nil {
- ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) {
+ // By default, set the task owner as the file owner.
+ creds := i.owner.Credentials()
+ uid := creds.EffectiveKUID
+ gid := creds.EffectiveKGID
+
+ // Linux doesn't apply dumpability adjustments to world readable/executable
+ // directories so that applications can stat /proc/PID to determine the
+ // effective UID of a process. See fs/proc/base.c:task_dump_owner.
+ if mode.FileType() == linux.ModeDirectory && mode.Permissions() == 0555 {
+ return uid, gid
}
- fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
- tpid := kernel.ThreadID(0)
- if tracer := s.t.Tracer(); tracer != nil {
- tpid = s.pidns.IDOfTask(tracer)
+
+ // If the task is not dumpable, then root (mapped into the task's user
+ // namespace when possible) owns the file.
+ m := getMM(i.owner)
+ if m == nil {
+ return auth.RootKUID, auth.RootKGID
}
- fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
- var fds int
- var vss, rss, data uint64
- s.t.WithMuLocked(func(t *kernel.Task) {
- if fdTable := t.FDTable(); fdTable != nil {
- fds = fdTable.Size()
+ if m.Dumpability() != mm.UserDumpable {
+ uid = auth.RootKUID
+ if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() {
+ uid = kuid
}
- if mm := t.MemoryManager(); mm != nil {
- vss = mm.VirtualMemorySize()
- rss = mm.ResidentSetSize()
- data = mm.VirtualDataSize()
+ gid = auth.RootKGID
+ if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() {
+ gid = kgid
}
- })
- fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
- fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
- fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
- fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
- fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count())
- creds := s.t.Credentials()
- fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
- fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
- fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
- fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
- fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode())
- return nil
-}
-
-// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider.
-type ioUsage interface {
- // IOUsage returns the io usage data.
- IOUsage() *usage.IO
-}
-
-// +stateify savable
-type ioData struct {
- ioUsage
+ }
+ return uid, gid
}
-var _ vfs.DynamicBytesSource = (*ioData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
- io := usage.IO{}
- io.Accumulate(i.IOUsage())
-
- fmt.Fprintf(buf, "char: %d\n", io.CharsRead)
- fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten)
- fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls)
- fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls)
- fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead)
- fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten)
- fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
- return nil
+func newIO(t *kernel.Task, isThreadGroup bool) *ioData {
+ if isThreadGroup {
+ return &ioData{ioUsage: t.ThreadGroup()}
+ }
+ return &ioData{ioUsage: t}
}
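For orientation, every per-task entry in the contents map above follows one pattern: a small struct embeds kernfs.DynamicBytesFile and a task pointer, implements Generate, and is wrapped by newTaskOwnedFile so its dirent reports the owning task's effective UID/GID. The sketch below is illustrative only; exampleData and the "example" entry are hypothetical and not part of this change.

// Sketch only: exampleData and the "example" key are hypothetical; they mirror
// the "maps"/"io" entries registered in newTaskInode above.
package proc

import (
	"bytes"
	"fmt"

	"gvisor.dev/gvisor/pkg/sentry/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
)

// exampleData would back a hypothetical /proc/[pid]/example file.
type exampleData struct {
	kernfs.DynamicBytesFile

	task *kernel.Task
}

var _ dynamicInode = (*exampleData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *exampleData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	fmt.Fprintf(buf, "%s\n", d.task.Name())
	return nil
}

// Registration in newTaskInode's contents map would look like:
//   "example": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &exampleData{task: task}),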
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
new file mode 100644
index 000000000..93f0e1aa8
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -0,0 +1,272 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// getMM gets the kernel task's MemoryManager. No additional reference is taken
+// on mm here. This is safe because MemoryManager.destroy is required to leave
+// the MemoryManager in a state where it's still usable as a DynamicBytesSource.
+func getMM(task *kernel.Task) *mm.MemoryManager {
+ var tmm *mm.MemoryManager
+ task.WithMuLocked(func(t *kernel.Task) {
+ if mm := t.MemoryManager(); mm != nil {
+ tmm = mm
+ }
+ })
+ return tmm
+}
+
+// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
+//
+// +stateify savable
+type mapsData struct {
+ kernfs.DynamicBytesFile
+
+ task *kernel.Task
+}
+
+var _ dynamicInode = (*mapsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ if mm := getMM(d.task); mm != nil {
+ mm.ReadMapsDataInto(ctx, buf)
+ }
+ return nil
+}
+
+// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
+//
+// +stateify savable
+type smapsData struct {
+ kernfs.DynamicBytesFile
+
+ task *kernel.Task
+}
+
+var _ dynamicInode = (*smapsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ if mm := getMM(d.task); mm != nil {
+ mm.ReadSmapsDataInto(ctx, buf)
+ }
+ return nil
+}
+
+// +stateify savable
+type taskStatData struct {
+ kernfs.DynamicBytesFile
+
+ t *kernel.Task
+
+ // If tgstats is true, accumulate fault stats (not implemented) and CPU
+ // time across all tasks in t's thread group.
+ tgstats bool
+
+ // pidns is the PID namespace associated with the proc filesystem that
+ // includes the file using this statData.
+ pidns *kernel.PIDNamespace
+}
+
+var _ dynamicInode = (*taskStatData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t))
+ fmt.Fprintf(buf, "(%s) ", s.t.Name())
+ fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0])
+ ppid := kernel.ThreadID(0)
+ if parent := s.t.Parent(); parent != nil {
+ ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+ }
+ fmt.Fprintf(buf, "%d ", ppid)
+ fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup()))
+ fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session()))
+ fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
+ fmt.Fprintf(buf, "0 " /* flags */)
+ fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
+ var cputime usage.CPUStats
+ if s.tgstats {
+ cputime = s.t.ThreadGroup().CPUStats()
+ } else {
+ cputime = s.t.CPUStats()
+ }
+ fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
+ cputime = s.t.ThreadGroup().JoinedChildCPUStats()
+ fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
+ fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness())
+ fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count())
+
+ // itrealvalue. Since kernel 2.6.17, this field is no longer
+ // maintained, and is hard coded as 0.
+ fmt.Fprintf(buf, "0 ")
+
+ // Start time is relative to boot time, expressed in clock ticks.
+ fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime())))
+
+ var vss, rss uint64
+ s.t.WithMuLocked(func(t *kernel.Task) {
+ if mm := t.MemoryManager(); mm != nil {
+ vss = mm.VirtualMemorySize()
+ rss = mm.ResidentSetSize()
+ }
+ })
+ fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize)
+
+ // rsslim.
+ fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur)
+
+ fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
+ fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
+ fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
+ terminationSignal := linux.Signal(0)
+ if s.t == s.t.ThreadGroup().Leader() {
+ terminationSignal = s.t.ThreadGroup().TerminationSignal()
+ }
+ fmt.Fprintf(buf, "%d ", terminationSignal)
+ fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
+ fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
+ fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
+ fmt.Fprintf(buf, "0\n" /* exit_code */)
+
+ return nil
+}
+
+// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
+//
+// +stateify savable
+type statmData struct {
+ kernfs.DynamicBytesFile
+
+ t *kernel.Task
+}
+
+var _ dynamicInode = (*statmData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ var vss, rss uint64
+ s.t.WithMuLocked(func(t *kernel.Task) {
+ if mm := t.MemoryManager(); mm != nil {
+ vss = mm.VirtualMemorySize()
+ rss = mm.ResidentSetSize()
+ }
+ })
+
+ fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize)
+ return nil
+}
+
+// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status.
+//
+// +stateify savable
+type statusData struct {
+ kernfs.DynamicBytesFile
+
+ t *kernel.Task
+ pidns *kernel.PIDNamespace
+}
+
+var _ dynamicInode = (*statusData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name())
+ fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus())
+ fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup()))
+ fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t))
+ ppid := kernel.ThreadID(0)
+ if parent := s.t.Parent(); parent != nil {
+ ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+ }
+ fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
+ tpid := kernel.ThreadID(0)
+ if tracer := s.t.Tracer(); tracer != nil {
+ tpid = s.pidns.IDOfTask(tracer)
+ }
+ fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
+ var fds int
+ var vss, rss, data uint64
+ s.t.WithMuLocked(func(t *kernel.Task) {
+ if fdTable := t.FDTable(); fdTable != nil {
+ fds = fdTable.Size()
+ }
+ if mm := t.MemoryManager(); mm != nil {
+ vss = mm.VirtualMemorySize()
+ rss = mm.ResidentSetSize()
+ data = mm.VirtualDataSize()
+ }
+ })
+ fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
+ fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
+ fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
+ fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
+ fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count())
+ creds := s.t.Credentials()
+ fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
+ fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
+ fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
+ fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
+ fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode())
+ // We unconditionally report a single NUMA node. See
+ // pkg/sentry/syscalls/linux/sys_mempolicy.go.
+ fmt.Fprintf(buf, "Mems_allowed:\t1\n")
+ fmt.Fprintf(buf, "Mems_allowed_list:\t0\n")
+ return nil
+}
+
+// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider.
+type ioUsage interface {
+ // IOUsage returns the io usage data.
+ IOUsage() *usage.IO
+}
+
+// +stateify savable
+type ioData struct {
+ kernfs.DynamicBytesFile
+
+ ioUsage
+}
+
+var _ dynamicInode = (*ioData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ io := usage.IO{}
+ io.Accumulate(i.IOUsage())
+
+ fmt.Fprintf(buf, "char: %d\n", io.CharsRead)
+ fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten)
+ fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls)
+ fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls)
+ fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead)
+ fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten)
+ fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
+ return nil
+}
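A note on units in the files above: statm reports page counts (values divided by usermem.PageSize), while status reports kB (values shifted right by 10). A tiny standalone sketch of that arithmetic, assuming the usual 4 KiB page size:

// Illustrative arithmetic only; 4 KiB is an assumption about usermem.PageSize.
package main

import "fmt"

func main() {
	const vss = 8 << 20      // 8 MiB of mapped virtual memory
	const pageSize = 4 << 10 // assumed page size

	fmt.Println(vss/pageSize, "pages") // 2048, the first field of /proc/[pid]/statm
	fmt.Println(vss>>10, "kB")         // 8192, the VmSize line of /proc/[pid]/status
}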
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
new file mode 100644
index 000000000..d8f92d52f
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -0,0 +1,218 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "sort"
+ "strconv"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+ defaultPermission = 0444
+ selfName = "self"
+ threadSelfName = "thread-self"
+)
+
+// InoGenerator generates unique inode numbers for a given filesystem.
+type InoGenerator interface {
+ NextIno() uint64
+}
+
+// tasksInode represents the inode for /proc/ directory.
+//
+// +stateify savable
+type tasksInode struct {
+ kernfs.InodeNotSymlink
+ kernfs.InodeDirectoryNoNewChildren
+ kernfs.InodeAttrs
+ kernfs.OrderedChildren
+
+ inoGen InoGenerator
+ pidns *kernel.PIDNamespace
+
+ // '/proc/self' and '/proc/thread-self' have custom directory offsets in
+ // Linux. So handle them outside of OrderedChildren.
+ selfSymlink *vfs.Dentry
+ threadSelfSymlink *vfs.Dentry
+}
+
+var _ kernfs.Inode = (*tasksInode)(nil)
+
+func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace) (*tasksInode, *kernfs.Dentry) {
+ root := auth.NewRootCredentials(pidns.UserNamespace())
+ contents := map[string]*kernfs.Dentry{
+ //"cpuinfo": newCPUInfo(ctx, msrc),
+ //"filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc),
+ "loadavg": newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}),
+ "meminfo": newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}),
+ "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"),
+ "stat": newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}),
+ //"uptime": newUptime(ctx, msrc),
+ //"version": newVersionData(root, inoGen.NextIno(), k),
+ "version": newDentry(root, inoGen.NextIno(), defaultPermission, &versionData{k: k}),
+ }
+
+ inode := &tasksInode{
+ pidns: pidns,
+ inoGen: inoGen,
+ selfSymlink: newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(),
+ threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(),
+ }
+ inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555)
+
+ dentry := &kernfs.Dentry{}
+ dentry.Init(inode)
+
+ inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+ links := inode.OrderedChildren.Populate(dentry, contents)
+ inode.IncLinks(links)
+
+ return inode, dentry
+}
+
+// Lookup implements kernfs.inodeDynamicLookup.
+func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+ // Try to lookup a corresponding task.
+ tid, err := strconv.ParseUint(name, 10, 64)
+ if err != nil {
+ // If it failed to parse, check if it's one of the special handled files.
+ switch name {
+ case selfName:
+ return i.selfSymlink, nil
+ case threadSelfName:
+ return i.threadSelfSymlink, nil
+ }
+ return nil, syserror.ENOENT
+ }
+
+ task := i.pidns.TaskWithID(kernel.ThreadID(tid))
+ if task == nil {
+ return nil, syserror.ENOENT
+ }
+
+ taskDentry := newTaskInode(i.inoGen, task, i.pidns, true)
+ return taskDentry.VFSDentry(), nil
+}
+
+// Valid implements kernfs.inodeDynamicLookup.
+func (i *tasksInode) Valid(ctx context.Context) bool {
+ return true
+}
+
+// IterDirents implements kernfs.inodeDynamicLookup.
+func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
+ // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
+ const FIRST_PROCESS_ENTRY = 256
+
+ // Use maxTaskID to shortcut searches that will result in 0 entries.
+ const maxTaskID = kernel.TasksLimit + 1
+ if offset >= maxTaskID {
+ return offset, nil
+ }
+
+ // According to Linux (fs/proc/base.c:proc_pid_readdir()), process directories
+ // start at offset FIRST_PROCESS_ENTRY with '/proc/self', followed by
+ // '/proc/thread-self' and then '/proc/[pid]'.
+ if offset < FIRST_PROCESS_ENTRY {
+ offset = FIRST_PROCESS_ENTRY
+ }
+
+ if offset == FIRST_PROCESS_ENTRY {
+ dirent := vfs.Dirent{
+ Name: selfName,
+ Type: linux.DT_LNK,
+ Ino: i.inoGen.NextIno(),
+ NextOff: offset + 1,
+ }
+ if !cb.Handle(dirent) {
+ return offset, nil
+ }
+ offset++
+ }
+ if offset == FIRST_PROCESS_ENTRY+1 {
+ dirent := vfs.Dirent{
+ Name: threadSelfName,
+ Type: linux.DT_LNK,
+ Ino: i.inoGen.NextIno(),
+ NextOff: offset + 1,
+ }
+ if !cb.Handle(dirent) {
+ return offset, nil
+ }
+ offset++
+ }
+
+ // Collect all tasks whose TGIDs fall at or after the specified offset. As in
+ // Linux, only thread group leaders are included in directory listings, but
+ // any task directory can still be reached by walking to it directly.
+ var tids []int
+ startTid := offset - FIRST_PROCESS_ENTRY - 2
+ for _, tg := range i.pidns.ThreadGroups() {
+ tid := i.pidns.IDOfThreadGroup(tg)
+ if int64(tid) < startTid {
+ continue
+ }
+ if leader := tg.Leader(); leader != nil {
+ tids = append(tids, int(tid))
+ }
+ }
+
+ if len(tids) == 0 {
+ return offset, nil
+ }
+
+ sort.Ints(tids)
+ for _, tid := range tids {
+ dirent := vfs.Dirent{
+ Name: strconv.FormatUint(uint64(tid), 10),
+ Type: linux.DT_DIR,
+ Ino: i.inoGen.NextIno(),
+ NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1,
+ }
+ if !cb.Handle(dirent) {
+ return offset, nil
+ }
+ offset++
+ }
+ return maxTaskID, nil
+}
+
+// Open implements kernfs.Inode.
+func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+ fd := &kernfs.GenericDirectoryFD{}
+ fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags)
+ return fd.VFSFileDescription(), nil
+}
+
+func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx {
+ stat := i.InodeAttrs.Stat(vsfs)
+
+ // Add dynamic children to link count.
+ for _, tg := range i.pidns.ThreadGroups() {
+ if leader := tg.Leader(); leader != nil {
+ stat.Nlink++
+ }
+ }
+
+ return stat
+}
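For reference, the dirent offsets produced by IterDirents above follow Linux's layout: 'self' sits at 256 (FIRST_PROCESS_ENTRY), 'thread-self' at 257, and the directory for PID p at 256+2+p, with NextOff always one past an entry's own offset. A small standalone sketch of that arithmetic (the helper name is illustrative, not part of the change):

// Sketch only: procPIDNextOff is a hypothetical helper reproducing the NextOff
// computed for /proc/[pid] dirents in IterDirents above.
package main

import "fmt"

const firstProcessEntry = 256 // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY

func procPIDNextOff(pid int64) int64 {
	return firstProcessEntry + 2 + pid + 1
}

func main() {
	fmt.Println(procPIDNextOff(1)) // 260, matching proc1 in the tests below
	fmt.Println(procPIDNextOff(3)) // 262, matching proc3 in the tests below
}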
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
new file mode 100644
index 000000000..91f30a798
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -0,0 +1,92 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "fmt"
+ "strconv"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+type selfSymlink struct {
+ kernfs.InodeAttrs
+ kernfs.InodeNoopRefCount
+ kernfs.InodeSymlink
+
+ pidns *kernel.PIDNamespace
+}
+
+var _ kernfs.Inode = (*selfSymlink)(nil)
+
+func newSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry {
+ inode := &selfSymlink{pidns: pidns}
+ inode.Init(creds, ino, linux.ModeSymlink|perm)
+
+ d := &kernfs.Dentry{}
+ d.Init(inode)
+ return d
+}
+
+func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // Who is reading this link?
+ return "", syserror.EINVAL
+ }
+ tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup())
+ if tgid == 0 {
+ return "", syserror.ENOENT
+ }
+ return strconv.FormatUint(uint64(tgid), 10), nil
+}
+
+type threadSelfSymlink struct {
+ kernfs.InodeAttrs
+ kernfs.InodeNoopRefCount
+ kernfs.InodeSymlink
+
+ pidns *kernel.PIDNamespace
+}
+
+var _ kernfs.Inode = (*threadSelfSymlink)(nil)
+
+func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry {
+ inode := &threadSelfSymlink{pidns: pidns}
+ inode.Init(creds, ino, linux.ModeSymlink|perm)
+
+ d := &kernfs.Dentry{}
+ d.Init(inode)
+ return d
+}
+
+func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
+ t := kernel.TaskFromContext(ctx)
+ if t == nil {
+ // Who is reading this link?
+ return "", syserror.EINVAL
+ }
+ tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup())
+ tid := s.pidns.IDOfTask(t)
+ if tid == 0 || tgid == 0 {
+ return "", syserror.ENOENT
+ }
+ return fmt.Sprintf("%d/task/%d", tgid, tid), nil
+}
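As a usage sketch: these links resolve through vfs.ReadlinkAt, and the result depends on the task carried by the context — 'self' yields the reader's TGID, and 'thread-self' yields '<tgid>/task/<tid>'. The helper below is hypothetical and simply mirrors the ReadlinkAt call used in the tests that follow:

// Sketch only: readProcSelf is a hypothetical helper, not part of this change.
package proc

import (
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/sentry/context"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// readProcSelf resolves the "self" symlink for the task carried by ctx. It
// returns the reader's TGID as a string (e.g. "42"), EINVAL if ctx carries no
// task, or ENOENT if that task is not visible in this procfs's PID namespace.
func readProcSelf(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry) (string, error) {
	return vfsObj.ReadlinkAt(
		ctx,
		auth.CredentialsFromContext(ctx),
		&vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("self")},
	)
}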
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
new file mode 100644
index 000000000..ca8c87ec2
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -0,0 +1,555 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "fmt"
+ "math"
+ "path"
+ "strconv"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+var (
+ // By convention, 'self' sits at offset 256 and 'thread-self' at 257; NextOff is the entry's own offset plus 1.
+ selfLink = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 0 + 1}
+ threadSelfLink = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 1 + 1}
+
+ // /proc/[pid] next offset starts at 256+2 (files above), then adds the
+ // PID, and adds 1 for the next offset.
+ proc1 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 1 + 1}
+ proc2 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 2 + 1}
+ proc3 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 3 + 1}
+)
+
+type testIterDirentsCallback struct {
+ dirents []vfs.Dirent
+}
+
+func (t *testIterDirentsCallback) Handle(d vfs.Dirent) bool {
+ t.dirents = append(t.dirents, d)
+ return true
+}
+
+func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) {
+ if got := len(dirs); got < 2 {
+ return dirs, fmt.Errorf("wrong number of dirents, want at least: 2, got: %d: %v", got, dirs)
+ }
+ for i, want := range []string{".", ".."} {
+ if got := dirs[i].Name; got != want {
+ return dirs, fmt.Errorf("wrong name, want: %s, got: %s", want, got)
+ }
+ if got := dirs[i].Type; got != linux.DT_DIR {
+ return dirs, fmt.Errorf("wrong type, want: %d, got: %d", linux.DT_DIR, got)
+ }
+ }
+ return dirs[2:], nil
+}
+
+func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
+ wants := map[string]vfs.Dirent{
+ "loadavg": {Type: linux.DT_REG},
+ "meminfo": {Type: linux.DT_REG},
+ "mounts": {Type: linux.DT_LNK},
+ "self": selfLink,
+ "stat": {Type: linux.DT_REG},
+ "thread-self": threadSelfLink,
+ "version": {Type: linux.DT_REG},
+ }
+ return checkFiles(gots, wants)
+}
+
+func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
+ wants := map[string]vfs.Dirent{
+ "io": {Type: linux.DT_REG},
+ "maps": {Type: linux.DT_REG},
+ "smaps": {Type: linux.DT_REG},
+ "stat": {Type: linux.DT_REG},
+ "statm": {Type: linux.DT_REG},
+ "status": {Type: linux.DT_REG},
+ }
+ return checkFiles(gots, wants)
+}
+
+func checkFiles(gots []vfs.Dirent, wants map[string]vfs.Dirent) ([]vfs.Dirent, error) {
+ // Go over all files; when there is a match, the file is removed from both
+ // 'gots' and 'wants'. 'wants' is expected to reach 0, as all files must
+ // be present. Any files remaining in 'gots' are returned to the caller to
+ // decide whether that is valid or not.
+ for i := 0; i < len(gots); i++ {
+ got := gots[i]
+ want, ok := wants[got.Name]
+ if !ok {
+ continue
+ }
+ if want.Type != got.Type {
+ return gots, fmt.Errorf("wrong file type, want: %v, got: %v: %+v", want.Type, got.Type, got)
+ }
+ if want.NextOff != 0 && want.NextOff != got.NextOff {
+ return gots, fmt.Errorf("wrong dirent offset, want: %v, got: %v: %+v", want.NextOff, got.NextOff, got)
+ }
+
+ delete(wants, got.Name)
+ gots = append(gots[0:i], gots[i+1:]...)
+ i--
+ }
+ if len(wants) != 0 {
+ return gots, fmt.Errorf("not all files were found, missing: %+v", wants)
+ }
+ return gots, nil
+}
+
+func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) {
+ k, err := boot()
+ if err != nil {
+ return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("creating kernel: %v", err)
+ }
+
+ ctx := k.SupervisorContext()
+ creds := auth.CredentialsFromContext(ctx)
+
+ vfsObj := vfs.New()
+ vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &vfs.GetFilesystemOptions{})
+ if err != nil {
+ return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err)
+ }
+ return ctx, vfsObj, mntns.Root(), nil
+}
+
+func TestTasksEmpty(t *testing.T) {
+ ctx, vfsObj, root, err := setup()
+ if err != nil {
+ t.Fatalf("Setup failed: %v", err)
+ }
+ defer root.DecRef()
+
+ fd, err := vfsObj.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt failed: %v", err)
+ }
+
+ cb := testIterDirentsCallback{}
+ if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+ t.Fatalf("IterDirents(): %v", err)
+ }
+ cb.dirents, err = checkDots(cb.dirents)
+ if err != nil {
+ t.Error(err.Error())
+ }
+ cb.dirents, err = checkTasksStaticFiles(cb.dirents)
+ if err != nil {
+ t.Error(err.Error())
+ }
+ if len(cb.dirents) != 0 {
+ t.Errorf("found more files than expected: %+v", cb.dirents)
+ }
+}
+
+func TestTasks(t *testing.T) {
+ ctx, vfsObj, root, err := setup()
+ if err != nil {
+ t.Fatalf("Setup failed: %v", err)
+ }
+ defer root.DecRef()
+
+ k := kernel.KernelFromContext(ctx)
+ var tasks []*kernel.Task
+ for i := 0; i < 5; i++ {
+ tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+ task, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc)
+ if err != nil {
+ t.Fatalf("CreateTask(): %v", err)
+ }
+ tasks = append(tasks, task)
+ }
+
+ fd, err := vfsObj.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
+ }
+
+ cb := testIterDirentsCallback{}
+ if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+ t.Fatalf("IterDirents(): %v", err)
+ }
+ cb.dirents, err = checkDots(cb.dirents)
+ if err != nil {
+ t.Error(err.Error())
+ }
+ cb.dirents, err = checkTasksStaticFiles(cb.dirents)
+ if err != nil {
+ t.Error(err.Error())
+ }
+ lastPid := 0
+ for _, d := range cb.dirents {
+ pid, err := strconv.Atoi(d.Name)
+ if err != nil {
+ t.Fatalf("Invalid process directory %q", d.Name)
+ }
+ if lastPid > pid {
+ t.Errorf("pids not in order: %v", cb.dirents)
+ }
+ found := false
+ for _, t := range tasks {
+ if k.TaskSet().Root.IDOfTask(t) == kernel.ThreadID(pid) {
+ found = true
+ }
+ }
+ if !found {
+ t.Errorf("Additional task ID %d listed: %v", pid, tasks)
+ }
+ // Next offset starts at 256+2 ('self' and 'thread-self'), then adds the
+ // PID, and adds 1 for the next offset.
+ if want := int64(256 + 2 + pid + 1); d.NextOff != want {
+ t.Errorf("Wrong dirent offset want: %d got: %d: %+v", want, d.NextOff, d)
+ }
+ }
+
+ // Test lookup.
+ for _, path := range []string{"/1", "/2"} {
+ fd, err := vfsObj.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(path)},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err)
+ }
+ buf := make([]byte, 1)
+ bufIOSeq := usermem.BytesIOSequence(buf)
+ if _, err := fd.Read(ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR {
+ t.Errorf("wrong error reading directory: %v", err)
+ }
+ }
+
+ if _, err := vfsObj.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/9999")},
+ &vfs.OpenOptions{},
+ ); err != syserror.ENOENT {
+ t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err)
+ }
+}
+
+func TestTasksOffset(t *testing.T) {
+ ctx, vfsObj, root, err := setup()
+ if err != nil {
+ t.Fatalf("Setup failed: %v", err)
+ }
+ defer root.DecRef()
+
+ k := kernel.KernelFromContext(ctx)
+ for i := 0; i < 3; i++ {
+ tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+ if _, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc); err != nil {
+ t.Fatalf("CreateTask(): %v", err)
+ }
+ }
+
+ for _, tc := range []struct {
+ name string
+ offset int64
+ wants map[string]vfs.Dirent
+ }{
+ {
+ name: "small offset",
+ offset: 100,
+ wants: map[string]vfs.Dirent{
+ "self": selfLink,
+ "thread-self": threadSelfLink,
+ "1": proc1,
+ "2": proc2,
+ "3": proc3,
+ },
+ },
+ {
+ name: "offset at start",
+ offset: 256,
+ wants: map[string]vfs.Dirent{
+ "self": selfLink,
+ "thread-self": threadSelfLink,
+ "1": proc1,
+ "2": proc2,
+ "3": proc3,
+ },
+ },
+ {
+ name: "skip /proc/self",
+ offset: 257,
+ wants: map[string]vfs.Dirent{
+ "thread-self": threadSelfLink,
+ "1": proc1,
+ "2": proc2,
+ "3": proc3,
+ },
+ },
+ {
+ name: "skip symlinks",
+ offset: 258,
+ wants: map[string]vfs.Dirent{
+ "1": proc1,
+ "2": proc2,
+ "3": proc3,
+ },
+ },
+ {
+ name: "skip first process",
+ offset: 260,
+ wants: map[string]vfs.Dirent{
+ "2": proc2,
+ "3": proc3,
+ },
+ },
+ {
+ name: "last process",
+ offset: 261,
+ wants: map[string]vfs.Dirent{
+ "3": proc3,
+ },
+ },
+ {
+ name: "after last",
+ offset: 262,
+ wants: nil,
+ },
+ {
+ name: "TaskLimit+1",
+ offset: kernel.TasksLimit + 1,
+ wants: nil,
+ },
+ {
+ name: "max",
+ offset: math.MaxInt64,
+ wants: nil,
+ },
+ } {
+ t.Run(tc.name, func(t *testing.T) {
+ fd, err := vfsObj.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
+ }
+ if _, err := fd.Impl().Seek(ctx, tc.offset, linux.SEEK_SET); err != nil {
+ t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err)
+ }
+
+ cb := testIterDirentsCallback{}
+ if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+ t.Fatalf("IterDirents(): %v", err)
+ }
+ if cb.dirents, err = checkFiles(cb.dirents, tc.wants); err != nil {
+ t.Error(err.Error())
+ }
+ if len(cb.dirents) != 0 {
+ t.Errorf("found more files than expected: %+v", cb.dirents)
+ }
+ })
+ }
+}
+
+func TestTask(t *testing.T) {
+ ctx, vfsObj, root, err := setup()
+ if err != nil {
+ t.Fatalf("Setup failed: %v", err)
+ }
+ defer root.DecRef()
+
+ k := kernel.KernelFromContext(ctx)
+ tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+ _, err = createTask(ctx, "name", tc)
+ if err != nil {
+ t.Fatalf("CreateTask(): %v", err)
+ }
+
+ fd, err := vfsObj.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/1")},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt(/1) failed: %v", err)
+ }
+
+ cb := testIterDirentsCallback{}
+ if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+ t.Fatalf("IterDirents(): %v", err)
+ }
+ cb.dirents, err = checkDots(cb.dirents)
+ if err != nil {
+ t.Error(err.Error())
+ }
+ cb.dirents, err = checkTaskStaticFiles(cb.dirents)
+ if err != nil {
+ t.Error(err.Error())
+ }
+ if len(cb.dirents) != 0 {
+ t.Errorf("found more files than expected: %+v", cb.dirents)
+ }
+}
+
+func TestProcSelf(t *testing.T) {
+ ctx, vfsObj, root, err := setup()
+ if err != nil {
+ t.Fatalf("Setup failed: %v", err)
+ }
+ defer root.DecRef()
+
+ k := kernel.KernelFromContext(ctx)
+ tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+ task, err := createTask(ctx, "name", tc)
+ if err != nil {
+ t.Fatalf("CreateTask(): %v", err)
+ }
+
+ fd, err := vfsObj.OpenAt(
+ task,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/self/"), FollowFinalSymlink: true},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt(/self/) failed: %v", err)
+ }
+
+ cb := testIterDirentsCallback{}
+ if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+ t.Fatalf("IterDirents(): %v", err)
+ }
+ cb.dirents, err = checkDots(cb.dirents)
+ if err != nil {
+ t.Error(err.Error())
+ }
+ cb.dirents, err = checkTaskStaticFiles(cb.dirents)
+ if err != nil {
+ t.Error(err.Error())
+ }
+ if len(cb.dirents) != 0 {
+ t.Errorf("found more files than expected: %+v", cb.dirents)
+ }
+}
+
+func iterateDir(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, fd *vfs.FileDescription) {
+ t.Logf("Iterating: /proc%s", fd.MappedName(ctx))
+
+ cb := testIterDirentsCallback{}
+ if err := fd.Impl().IterDirents(ctx, &cb); err != nil {
+ t.Fatalf("IterDirents(): %v", err)
+ }
+ var err error
+ cb.dirents, err = checkDots(cb.dirents)
+ if err != nil {
+ t.Error(err.Error())
+ }
+ for _, d := range cb.dirents {
+ childPath := path.Join(fd.MappedName(ctx), d.Name)
+ if d.Type == linux.DT_LNK {
+ link, err := vfsObj.ReadlinkAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)},
+ )
+ if err != nil {
+ t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err)
+ } else {
+ t.Logf("Skipping symlink: /proc%s => %s", childPath, link)
+ }
+ continue
+ }
+
+ t.Logf("Opening: /proc%s", childPath)
+ child, err := vfsObj.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err)
+ continue
+ }
+ stat, err := child.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ t.Errorf("Stat(%v) failed: %v", childPath, err)
+ }
+ if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type {
+ t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type)
+ }
+ if d.Type == linux.DT_DIR {
+ // Found another dir, let's do it again!
+ iterateDir(ctx, t, vfsObj, root, child)
+ }
+ }
+}
+
+// TestTree iterates all directories and stats every file.
+func TestTree(t *testing.T) {
+ uberCtx, vfsObj, root, err := setup()
+ if err != nil {
+ t.Fatalf("Setup failed: %v", err)
+ }
+ defer root.DecRef()
+
+ k := kernel.KernelFromContext(uberCtx)
+ var tasks []*kernel.Task
+ for i := 0; i < 5; i++ {
+ tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+ task, err := createTask(uberCtx, fmt.Sprintf("name-%d", i), tc)
+ if err != nil {
+ t.Fatalf("CreateTask(): %v", err)
+ }
+ tasks = append(tasks, task)
+ }
+
+ ctx := tasks[0]
+ fd, err := vfsObj.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(uberCtx),
+ &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
+ }
+ iterateDir(ctx, t, vfsObj, root, fd)
+}
diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go
index e1643d4e0..367f2396b 100644
--- a/pkg/sentry/fsimpl/proc/version.go
+++ b/pkg/sentry/fsimpl/proc/version.go
@@ -19,19 +19,21 @@ import (
"fmt"
"gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
)
// versionData implements vfs.DynamicBytesSource for /proc/version.
//
// +stateify savable
type versionData struct {
+ kernfs.DynamicBytesFile
+
// k is the owning Kernel.
k *kernel.Kernel
}
-var _ vfs.DynamicBytesSource = (*versionData)(nil)
+var _ dynamicInode = (*versionData)(nil)
// Generate implements vfs.DynamicBytesSource.Generate.
func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index bc5c0b591..7601c7c04 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -1,14 +1,13 @@
load("//tools/go_stateify:defs.bzl", "go_library")
load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
package(licenses = ["notice"])
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
go_template_instance(
name = "dentry_list",
out = "dentry_list.go",
- package = "memfs",
+ package = "tmpfs",
prefix = "dentry",
template = "//pkg/ilist:generic_list",
types = {
@@ -18,26 +17,38 @@ go_template_instance(
)
go_library(
- name = "memfs",
+ name = "tmpfs",
srcs = [
"dentry_list.go",
"directory.go",
"filesystem.go",
- "memfs.go",
"named_pipe.go",
"regular_file.go",
"symlink.go",
+ "tmpfs.go",
],
- importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs",
+ importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs",
deps = [
"//pkg/abi/linux",
"//pkg/amutex",
+ "//pkg/fspath",
+ "//pkg/log",
"//pkg/sentry/arch",
"//pkg/sentry/context",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/pipe",
+ "//pkg/sentry/kernel/time",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/pgalloc",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/safemem",
+ "//pkg/sentry/usage",
"//pkg/sentry/usermem",
"//pkg/sentry/vfs",
+ "//pkg/sync",
"//pkg/syserror",
],
)
@@ -47,8 +58,9 @@ go_test(
size = "small",
srcs = ["benchmark_test.go"],
deps = [
- ":memfs",
+ ":tmpfs",
"//pkg/abi/linux",
+ "//pkg/fspath",
"//pkg/refs",
"//pkg/sentry/context",
"//pkg/sentry/context/contexttest",
@@ -61,15 +73,21 @@ go_test(
)
go_test(
- name = "memfs_test",
+ name = "tmpfs_test",
size = "small",
- srcs = ["pipe_test.go"],
- embed = [":memfs"],
+ srcs = [
+ "pipe_test.go",
+ "regular_file_test.go",
+ "stat_test.go",
+ ],
+ embed = [":tmpfs"],
deps = [
"//pkg/abi/linux",
+ "//pkg/fspath",
"//pkg/sentry/context",
"//pkg/sentry/context/contexttest",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/contexttest",
"//pkg/sentry/usermem",
"//pkg/sentry/vfs",
"//pkg/syserror",
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index ea6417ce7..d88c83499 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -21,12 +21,13 @@ import (
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fs"
_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
- "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
@@ -175,8 +176,10 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
// Create VFS.
vfsObj := vfs.New()
- vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
+ vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
if err != nil {
b.Fatalf("failed to create tmpfs root mount: %v", err)
}
@@ -193,9 +196,9 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
for i := depth; i > 0; i-- {
name := fmt.Sprintf("%d", i)
pop := vfs.PathOperation{
- Root: root,
- Start: vd,
- Pathname: name,
+ Root: root,
+ Start: vd,
+ Path: fspath.Parse(name),
}
if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
Mode: 0755,
@@ -216,7 +219,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
Root: root,
Start: vd,
- Pathname: filename,
+ Path: fspath.Parse(filename),
FollowFinalSymlink: true,
}, &vfs.OpenOptions{
Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
@@ -237,7 +240,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
Root: root,
Start: root,
- Pathname: filePath,
+ Path: fspath.Parse(filePath),
FollowFinalSymlink: true,
}, &vfs.StatOptions{})
if err != nil {
@@ -364,8 +367,10 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
// Create VFS.
vfsObj := vfs.New()
- vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
+ vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
if err != nil {
b.Fatalf("failed to create tmpfs root mount: %v", err)
}
@@ -378,9 +383,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
root := mntns.Root()
defer root.DecRef()
pop := vfs.PathOperation{
- Root: root,
- Start: root,
- Pathname: mountPointName,
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(mountPointName),
}
if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
Mode: 0755,
@@ -394,7 +399,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
}
defer mountPoint.DecRef()
// Create and mount the submount.
- if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.GetFilesystemOptions{}); err != nil {
+ if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
b.Fatalf("failed to mount tmpfs submount: %v", err)
}
filePathBuilder.WriteString(mountPointName)
@@ -408,9 +413,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
for i := depth; i > 0; i-- {
name := fmt.Sprintf("%d", i)
pop := vfs.PathOperation{
- Root: root,
- Start: vd,
- Pathname: name,
+ Root: root,
+ Start: vd,
+ Path: fspath.Parse(name),
}
if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
Mode: 0755,
@@ -438,7 +443,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
Root: root,
Start: vd,
- Pathname: filename,
+ Path: fspath.Parse(filename),
FollowFinalSymlink: true,
}, &vfs.OpenOptions{
Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
@@ -458,7 +463,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
Root: root,
Start: root,
- Pathname: filePath,
+ Path: fspath.Parse(filePath),
FollowFinalSymlink: true,
}, &vfs.StatOptions{})
if err != nil {
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 0bd82e480..887ca2619 100644
--- a/pkg/sentry/fsimpl/memfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-package memfs
+package tmpfs
import (
"gvisor.dev/gvisor/pkg/abi/linux"
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
new file mode 100644
index 000000000..4cd7e9aea
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -0,0 +1,696 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+ // All filesystem state is in-memory.
+ return nil
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+ if !d.inode.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+ return nil, err
+ }
+afterSymlink:
+ nextVFSD, err := rp.ResolveComponent(&d.vfsd)
+ if err != nil {
+ return nil, err
+ }
+ if nextVFSD == nil {
+ // Since the Dentry tree is the sole source of truth for tmpfs, if it's
+ // not in the Dentry tree, it doesn't exist.
+ return nil, syserror.ENOENT
+ }
+ next := nextVFSD.Impl().(*dentry)
+ if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+ // TODO(gvisor.dev/issues/1197): Symlink traversals should update
+ // the access time.
+ if err := rp.HandleSymlink(symlink.target); err != nil {
+ return nil, err
+ }
+ goto afterSymlink // don't check the current directory again
+ }
+ rp.Advance()
+ return next, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+ for !rp.Final() {
+ next, err := stepLocked(rp, d)
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if !d.inode.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
+//
+// Preconditions: filesystem.mu must be locked.
+func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
+ d := rp.Start().Impl().(*dentry)
+ for !rp.Done() {
+ next, err := stepLocked(rp, d)
+ if err != nil {
+ return nil, err
+ }
+ d = next
+ }
+ if rp.MustBeDir() && !d.inode.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ return d, nil
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
+//
+// doCreateAt is loosely analogous to a conjunction of Linux's
+// fs/namei.c:filename_create() and done_path_create().
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ if err != nil {
+ return err
+ }
+ if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+ return err
+ }
+ name := rp.Component()
+ if name == "." || name == ".." {
+ return syserror.EEXIST
+ }
+ // Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
+ // because if the child exists we want to return EEXIST immediately instead
+ // of attempting symlink/mount traversal.
+ if parent.vfsd.Child(name) != nil {
+ return syserror.EEXIST
+ }
+ if !dir && rp.MustBeDir() {
+ return syserror.ENOENT
+ }
+ // In tmpfs, the only way to cause a dentry to be disowned is by removing
+ // it from the filesystem, so this check is equivalent to checking if
+ // parent has been removed.
+ if parent.vfsd.IsDisowned() {
+ return syserror.ENOENT
+ }
+ mnt := rp.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ return create(parent, name)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ d, err := resolveLocked(rp)
+ if err != nil {
+ return nil, err
+ }
+ if opts.CheckSearchable {
+ if !d.inode.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil {
+ return nil, err
+ }
+ }
+ d.IncRef()
+ return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ if err != nil {
+ return nil, err
+ }
+ d.IncRef()
+ return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+ return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+ if rp.Mount() != vd.Mount() {
+ return syserror.EXDEV
+ }
+ d := vd.Dentry().Impl().(*dentry)
+ if d.inode.isDir() {
+ return syserror.EPERM
+ }
+ if d.inode.nlink == 0 {
+ return syserror.ENOENT
+ }
+ if d.inode.nlink == maxLinks {
+ return syserror.EMLINK
+ }
+ d.inode.incLinksLocked()
+ child := fs.newDentry(d.inode)
+ parent.vfsd.InsertChild(&child.vfsd, name)
+ parent.inode.impl.(*directory).childList.PushBack(child)
+ return nil
+ })
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+ return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error {
+ if parent.inode.nlink == maxLinks {
+ return syserror.EMLINK
+ }
+ parent.inode.incLinksLocked() // from child's ".."
+ child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
+ parent.vfsd.InsertChild(&child.vfsd, name)
+ parent.inode.impl.(*directory).childList.PushBack(child)
+ return nil
+ })
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+ return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+ switch opts.Mode.FileType() {
+ case 0, linux.S_IFREG:
+ child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+ parent.vfsd.InsertChild(&child.vfsd, name)
+ parent.inode.impl.(*directory).childList.PushBack(child)
+ return nil
+ case linux.S_IFIFO:
+ child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
+ parent.vfsd.InsertChild(&child.vfsd, name)
+ parent.inode.impl.(*directory).childList.PushBack(child)
+ return nil
+ case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK:
+ // Not yet supported.
+ return syserror.EPERM
+ default:
+ return syserror.EINVAL
+ }
+ })
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ if opts.Flags&linux.O_TMPFILE != 0 {
+ // Not yet supported.
+ return nil, syserror.EOPNOTSUPP
+ }
+
+ // Handle O_CREAT and !O_CREAT separately, since in the latter case we
+ // don't need fs.mu for writing.
+ if opts.Flags&linux.O_CREAT == 0 {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ d, err := resolveLocked(rp)
+ if err != nil {
+ return nil, err
+ }
+ return d.open(ctx, rp, opts.Flags, false /* afterCreate */)
+ }
+
+ mustCreate := opts.Flags&linux.O_EXCL != 0
+ start := rp.Start().Impl().(*dentry)
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ if rp.Done() {
+ // Reject attempts to open directories with O_CREAT.
+ if rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ return start.open(ctx, rp, opts.Flags, false /* afterCreate */)
+ }
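+ // If the final path component resolves to a symlink, stepLocked follows
+ // it and leaves the symlink target as the remaining path, so resolution
+ // restarts at this label to re-walk the parent directory.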
+afterTrailingSymlink:
+ parent, err := walkParentDirLocked(rp, start)
+ if err != nil {
+ return nil, err
+ }
+ // Check for search permission in the parent directory.
+ if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+ return nil, err
+ }
+ // Reject attempts to open directories with O_CREAT.
+ if rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ name := rp.Component()
+ if name == "." || name == ".." {
+ return nil, syserror.EISDIR
+ }
+ // Determine whether or not we need to create a file.
+ child, err := stepLocked(rp, parent)
+ if err == syserror.ENOENT {
+ // Already checked for searchability above; now check for writability.
+ if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+ return nil, err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return nil, err
+ }
+ defer rp.Mount().EndWrite()
+ // Create and open the child.
+ child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+ parent.vfsd.InsertChild(&child.vfsd, name)
+ parent.inode.impl.(*directory).childList.PushBack(child)
+ return child.open(ctx, rp, opts.Flags, true)
+ }
+ if err != nil {
+ return nil, err
+ }
+ // Do we need to resolve a trailing symlink?
+ if !rp.Done() {
+ start = parent
+ goto afterTrailingSymlink
+ }
+ // Open existing file.
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ return child.open(ctx, rp, opts.Flags, false)
+}
+
+func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+ ats := vfs.AccessTypesForOpenFlags(flags)
+ if !afterCreate {
+ if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
+ return nil, err
+ }
+ }
+ mnt := rp.Mount()
+ switch impl := d.inode.impl.(type) {
+ case *regularFile:
+ var fd regularFileFD
+ fd.readable = vfs.MayReadFileWithOpenFlags(flags)
+ fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
+ if fd.writable {
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return nil, err
+ }
+ // mnt.EndWrite() is called by regularFileFD.Release().
+ }
+ fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
+ if flags&linux.O_TRUNC != 0 {
+ impl.mu.Lock()
+ impl.data.Truncate(0, impl.memFile)
+ atomic.StoreUint64(&impl.size, 0)
+ impl.mu.Unlock()
+ }
+ return &fd.vfsfd, nil
+ case *directory:
+ // Can't open directories writably.
+ if ats&vfs.MayWrite != 0 {
+ return nil, syserror.EISDIR
+ }
+ var fd directoryFD
+ fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
+ return &fd.vfsfd, nil
+ case *symlink:
+ // Can't open symlinks without O_PATH (which is unimplemented).
+ return nil, syserror.ELOOP
+ case *namedPipe:
+ return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags)
+ default:
+ panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
+ }
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ d, err := resolveLocked(rp)
+ if err != nil {
+ return "", err
+ }
+ symlink, ok := d.inode.impl.(*symlink)
+ if !ok {
+ return "", syserror.EINVAL
+ }
+ return symlink.target, nil
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+ if opts.Flags != 0 {
+ // TODO(b/145974740): Support renameat2 flags.
+ return syserror.EINVAL
+ }
+
+ // Resolve newParent first to verify that it's on this Mount.
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ if err != nil {
+ return err
+ }
+ newName := rp.Component()
+ if newName == "." || newName == ".." {
+ return syserror.EBUSY
+ }
+ mnt := rp.Mount()
+ if mnt != oldParentVD.Mount() {
+ return syserror.EXDEV
+ }
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+
+ oldParent := oldParentVD.Dentry().Impl().(*dentry)
+ if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+ return err
+ }
+ // Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
+ // because if the existing child is a symlink or mount point then we want
+ // to rename over it rather than follow it.
+ renamedVFSD := oldParent.vfsd.Child(oldName)
+ if renamedVFSD == nil {
+ return syserror.ENOENT
+ }
+ renamed := renamedVFSD.Impl().(*dentry)
+ if renamed.inode.isDir() {
+ if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) {
+ return syserror.EINVAL
+ }
+ if oldParent != newParent {
+ // Writability is needed to change renamed's "..".
+ if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil {
+ return err
+ }
+ }
+ } else {
+ if opts.MustBeDir || rp.MustBeDir() {
+ return syserror.ENOTDIR
+ }
+ }
+
+ if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+ return err
+ }
+ replacedVFSD := newParent.vfsd.Child(newName)
+ var replaced *dentry
+ if replacedVFSD != nil {
+ replaced = replacedVFSD.Impl().(*dentry)
+ if replaced.inode.isDir() {
+ if !renamed.inode.isDir() {
+ return syserror.EISDIR
+ }
+ if replaced.vfsd.HasChildren() {
+ return syserror.ENOTEMPTY
+ }
+ } else {
+ if rp.MustBeDir() {
+ return syserror.ENOTDIR
+ }
+ if renamed.inode.isDir() {
+ return syserror.ENOTDIR
+ }
+ }
+ } else {
+ if renamed.inode.isDir() && newParent.inode.nlink == maxLinks {
+ return syserror.EMLINK
+ }
+ }
+ if newParent.vfsd.IsDisowned() {
+ return syserror.ENOENT
+ }
+
+ // Linux places this check before some of those above; we do it here for
+ // simplicity, under the assumption that applications are not intentionally
+ // doing noop renames expecting them to succeed where non-noop renames
+ // would fail.
+ if renamedVFSD == replacedVFSD {
+ return nil
+ }
+ vfsObj := rp.VirtualFilesystem()
+ oldParentDir := oldParent.inode.impl.(*directory)
+ newParentDir := newParent.inode.impl.(*directory)
+ if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
+ return err
+ }
+ if replaced != nil {
+ newParentDir.childList.Remove(replaced)
+ if replaced.inode.isDir() {
+ newParent.inode.decLinksLocked() // from replaced's ".."
+ }
+ replaced.inode.decLinksLocked()
+ }
+ oldParentDir.childList.Remove(renamed)
+ newParentDir.childList.PushBack(renamed)
+ if renamed.inode.isDir() {
+ oldParent.inode.decLinksLocked()
+ newParent.inode.incLinksLocked()
+ }
+ // TODO(gvisor.dev/issues/1197): Update timestamps and parent directory
+ // sizes.
+ vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
+ return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ if err != nil {
+ return err
+ }
+ if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+ return err
+ }
+ name := rp.Component()
+ if name == "." {
+ return syserror.EINVAL
+ }
+ if name == ".." {
+ return syserror.ENOTEMPTY
+ }
+ childVFSD := parent.vfsd.Child(name)
+ if childVFSD == nil {
+ return syserror.ENOENT
+ }
+ child := childVFSD.Impl().(*dentry)
+ if !child.inode.isDir() {
+ return syserror.ENOTDIR
+ }
+ if childVFSD.HasChildren() {
+ return syserror.ENOTEMPTY
+ }
+ mnt := rp.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ vfsObj := rp.VirtualFilesystem()
+ if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+ return err
+ }
+ parent.inode.impl.(*directory).childList.Remove(child)
+ parent.inode.decLinksLocked() // from child's ".."
+ child.inode.decLinksLocked()
+ vfsObj.CommitDeleteDentry(childVFSD)
+ return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ d, err := resolveLocked(rp)
+ if err != nil {
+ return err
+ }
+ return d.inode.setStat(opts.Stat)
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ d, err := resolveLocked(rp)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ var stat linux.Statx
+ d.inode.statTo(&stat)
+ return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ _, err := resolveLocked(rp)
+ if err != nil {
+ return linux.Statfs{}, err
+ }
+ // TODO(gvisor.dev/issues/1197): Actually implement statfs.
+ return linux.Statfs{}, syserror.ENOSYS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+ return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+ child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+ parent.vfsd.InsertChild(&child.vfsd, name)
+ parent.inode.impl.(*directory).childList.PushBack(child)
+ return nil
+ })
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+ if err != nil {
+ return err
+ }
+ if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+ return err
+ }
+ name := rp.Component()
+ if name == "." || name == ".." {
+ return syserror.EISDIR
+ }
+ childVFSD := parent.vfsd.Child(name)
+ if childVFSD == nil {
+ return syserror.ENOENT
+ }
+ child := childVFSD.Impl().(*dentry)
+ if child.inode.isDir() {
+ return syserror.EISDIR
+ }
+ if rp.MustBeDir() {
+ return syserror.ENOTDIR
+ }
+ mnt := rp.Mount()
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ defer mnt.EndWrite()
+ vfsObj := rp.VirtualFilesystem()
+ if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+ return err
+ }
+ parent.inode.impl.(*directory).childList.Remove(child)
+ child.inode.decLinksLocked()
+ vfsObj.CommitDeleteDentry(childVFSD)
+ return nil
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ _, err := resolveLocked(rp)
+ if err != nil {
+ return nil, err
+ }
+ // TODO(b/127675828): support extended attributes
+ return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ _, err := resolveLocked(rp)
+ if err != nil {
+ return "", err
+ }
+ // TODO(b/127675828): support extended attributes
+ return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ _, err := resolveLocked(rp)
+ if err != nil {
+ return err
+ }
+ // TODO(b/127675828): support extended attributes
+ return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ _, err := resolveLocked(rp)
+ if err != nil {
+ return err
+ }
+ // TODO(b/127675828): support extended attributes
+ return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 91cb4b1fc..40bde54de 100644
--- a/pkg/sentry/fsimpl/memfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-package memfs
+package tmpfs
import (
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -55,8 +55,6 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v
return nil, err
}
mnt := rp.Mount()
- mnt.IncRef()
- vfsd.IncRef()
- fd.vfsfd.Init(&fd, mnt, vfsd)
+ fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
return &fd.vfsfd, nil
}
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index a3a870571..70b42a6ec 100644
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -12,13 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-package memfs
+package tmpfs
import (
"bytes"
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -38,7 +39,7 @@ func TestSeparateFDs(t *testing.T) {
pop := vfs.PathOperation{
Root: root,
Start: root,
- Pathname: fileName,
+ Path: fspath.Parse(fileName),
FollowFinalSymlink: true,
}
rfdchan := make(chan *vfs.FileDescription)
@@ -76,7 +77,7 @@ func TestNonblockingRead(t *testing.T) {
pop := vfs.PathOperation{
Root: root,
Start: root,
- Pathname: fileName,
+ Path: fspath.Parse(fileName),
FollowFinalSymlink: true,
}
openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK}
@@ -108,7 +109,7 @@ func TestNonblockingWriteError(t *testing.T) {
pop := vfs.PathOperation{
Root: root,
Start: root,
- Pathname: fileName,
+ Path: fspath.Parse(fileName),
FollowFinalSymlink: true,
}
openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK}
@@ -126,7 +127,7 @@ func TestSingleFD(t *testing.T) {
pop := vfs.PathOperation{
Root: root,
Start: root,
- Pathname: fileName,
+ Path: fspath.Parse(fileName),
FollowFinalSymlink: true,
}
openOpts := vfs.OpenOptions{Flags: linux.O_RDWR}
@@ -151,8 +152,10 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
// Create VFS.
vfsObj := vfs.New()
- vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
+ vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
if err != nil {
t.Fatalf("failed to create tmpfs root mount: %v", err)
}
@@ -160,10 +163,9 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
// Create the pipe.
root := mntns.Root()
pop := vfs.PathOperation{
- Root: root,
- Start: root,
- Pathname: fileName,
- FollowFinalSymlink: true,
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(fileName),
}
mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644}
if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil {
@@ -174,7 +176,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
Root: root,
Start: root,
- Pathname: fileName,
+ Path: fspath.Parse(fileName),
FollowFinalSymlink: true,
}, &vfs.StatOptions{})
if err != nil {
@@ -194,7 +196,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
readData := make([]byte, 1)
dst := usermem.BytesIOSequence(readData)
- bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{})
+ bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
if err != syserror.ErrWouldBlock {
t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err)
}
@@ -207,7 +209,7 @@ func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
writeData := []byte(msg)
src := usermem.BytesIOSequence(writeData)
- bytesWritten, err := fd.Impl().Write(ctx, src, vfs.WriteOptions{})
+ bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{})
if err != nil {
t.Fatalf("error writing to pipe %q: %v", fileName, err)
}
@@ -220,7 +222,7 @@ func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg
func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
readData := make([]byte, len(msg))
dst := usermem.BytesIOSequence(readData)
- bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{})
+ bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
if err != nil {
t.Fatalf("error reading from pipe %q: %v", fileName, err)
}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
new file mode 100644
index 000000000..f200e767d
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -0,0 +1,357 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "io"
+ "math"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
+ "gvisor.dev/gvisor/pkg/sentry/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+type regularFile struct {
+ inode inode
+
+ // memFile is a platform.File used to allocate pages to this regularFile.
+ memFile *pgalloc.MemoryFile
+
+ // mu protects the fields below.
+ mu sync.RWMutex
+
+ // data maps offsets into the file to offsets into memFile that store
+ // the file's data.
+ data fsutil.FileRangeSet
+
+ // size is the size of data, but accessed using atomic memory
+ // operations to avoid locking in inode.stat().
+ size uint64
+
+ // seals represents file seals on this inode.
+ seals uint32
+}
+
+func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
+ file := &regularFile{
+ memFile: fs.memFile,
+ }
+ file.inode.init(file, fs, creds, mode)
+ file.inode.nlink = 1 // from parent directory
+ return &file.inode
+}
+
+type regularFileFD struct {
+ fileDescription
+
+ // These are immutable.
+ readable bool
+ writable bool
+
+ // off is the file offset. off is accessed using atomic memory operations.
+ // offMu serializes operations that may mutate off.
+ off int64
+ offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release() {
+ if fd.writable {
+ fd.vfsfd.VirtualDentry().Mount().EndWrite()
+ }
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ if !fd.readable {
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ if dst.NumBytes() == 0 {
+ return 0, nil
+ }
+ f := fd.inode().impl.(*regularFile)
+ rw := getRegularFileReadWriter(f, offset)
+ n, err := dst.CopyOutFrom(ctx, rw)
+ putRegularFileReadWriter(rw)
+ return int64(n), err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ fd.offMu.Lock()
+ n, err := fd.PRead(ctx, dst, fd.off, opts)
+ fd.off += n
+ fd.offMu.Unlock()
+ return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ if !fd.writable {
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ srclen := src.NumBytes()
+ if srclen == 0 {
+ return 0, nil
+ }
+ f := fd.inode().impl.(*regularFile)
+ end := offset + srclen
+ if end < offset {
+ // Overflow.
+ return 0, syserror.EFBIG
+ }
+ rw := getRegularFileReadWriter(f, offset)
+ n, err := src.CopyInTo(ctx, rw)
+ putRegularFileReadWriter(rw)
+ return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ fd.offMu.Lock()
+ n, err := fd.PWrite(ctx, src, fd.off, opts)
+ fd.off += n
+ fd.offMu.Unlock()
+ return n, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.offMu.Lock()
+ defer fd.offMu.Unlock()
+ switch whence {
+ case linux.SEEK_SET:
+ // use offset as specified
+ case linux.SEEK_CUR:
+ offset += fd.off
+ case linux.SEEK_END:
+ offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size))
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ fd.off = offset
+ return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+ return nil
+}
+
+// regularFileReadWriter implements safemem.Reader and safemem.Writer.
+type regularFileReadWriter struct {
+ file *regularFile
+
+ // Offset into the file to read/write at. Note that this may be
+ // different from the FD offset if PRead/PWrite is used.
+ off uint64
+}
+
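+// regularFileReadWriterPool recycles regularFileReadWriters so that read and
+// write calls do not allocate a new one on every call.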
+var regularFileReadWriterPool = sync.Pool{
+ New: func() interface{} {
+ return &regularFileReadWriter{}
+ },
+}
+
+func getRegularFileReadWriter(file *regularFile, offset int64) *regularFileReadWriter {
+ rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
+ rw.file = file
+ rw.off = uint64(offset)
+ return rw
+}
+
+func putRegularFileReadWriter(rw *regularFileReadWriter) {
+ rw.file = nil
+ regularFileReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ rw.file.mu.RLock()
+
+ // Compute the range to read (limited by file size and overflow-checked).
+ if rw.off >= rw.file.size {
+ rw.file.mu.RUnlock()
+ return 0, io.EOF
+ }
+ end := rw.file.size
+ if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
+ end = rend
+ }
+
+ var done uint64
+ seg, gap := rw.file.data.Find(uint64(rw.off))
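+ // Iterate over the file's FileRangeSet: segments are backed by pages in
+ // memFile, and gaps are holes that read as zeroes.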
+ for rw.off < end {
+ mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
+ switch {
+ case seg.Ok():
+ // Get internal mappings.
+ ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
+ if err != nil {
+ rw.file.mu.RUnlock()
+ return done, err
+ }
+
+ // Copy from internal mappings.
+ n, err := safemem.CopySeq(dsts, ims)
+ done += n
+ rw.off += uint64(n)
+ dsts = dsts.DropFirst64(n)
+ if err != nil {
+ rw.file.mu.RUnlock()
+ return done, err
+ }
+
+ // Continue.
+ seg, gap = seg.NextNonEmpty()
+
+ case gap.Ok():
+ // Tmpfs holes are zero-filled.
+ gapmr := gap.Range().Intersect(mr)
+ dst := dsts.TakeFirst64(gapmr.Length())
+ n, err := safemem.ZeroSeq(dst)
+ done += n
+ rw.off += uint64(n)
+ dsts = dsts.DropFirst64(n)
+ if err != nil {
+ rw.file.mu.RUnlock()
+ return done, err
+ }
+
+ // Continue.
+ seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+ }
+ }
+ rw.file.mu.RUnlock()
+ return done, nil
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ rw.file.mu.Lock()
+
+ // Compute the range to write (overflow-checked).
+ end := rw.off + srcs.NumBytes()
+ if end <= rw.off {
+ end = math.MaxInt64
+ }
+
+ // Check if seals prevent either file growth or all writes.
+ switch {
+ case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
+ rw.file.mu.Unlock()
+ return 0, syserror.EPERM
+ case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
+ // When growth is sealed, Linux effectively allows writes which would
+ // normally grow the file to partially succeed up to the current EOF,
+ // rounded down to the page boundary before the EOF.
+ //
+ // This happens because writes (and thus the growth check) for tmpfs
+ // files proceed page-by-page on Linux, and the final write to the page
+ // containing EOF fails, resulting in a partial write up to the start of
+ // that page.
+ //
+ // To emulate this behaviour, artificially truncate the write to the
+ // start of the page containing the current EOF.
+ //
+ // See Linux, mm/filemap.c:generic_perform_write() and
+ // mm/shmem.c:shmem_write_begin().
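+ //
+ // For example, assuming a 4 KiB page size and a current file size of
+ // 6144 bytes, pgstart is 4096: a growing write starting below offset
+ // 4096 is truncated to end there, and one starting at or beyond 4096
+ // fails with EPERM.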
+ if pgstart := uint64(usermem.Addr(rw.file.size).RoundDown()); end > pgstart {
+ end = pgstart
+ }
+ if end <= rw.off {
+ // Truncation would result in no data being written.
+ rw.file.mu.Unlock()
+ return 0, syserror.EPERM
+ }
+ }
+
+ // Page-aligned mr for when we need to allocate memory. RoundUp can't
+ // overflow since end is at most math.MaxInt64.
+ pgstartaddr := usermem.Addr(rw.off).RoundDown()
+ pgendaddr, _ := usermem.Addr(end).RoundUp()
+ pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}
+
+ var (
+ done uint64
+ retErr error
+ )
+ seg, gap := rw.file.data.Find(uint64(rw.off))
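+ // Iterate over the file's FileRangeSet: copy into existing segments, and
+ // allocate pages from memFile to fill gaps in the written range.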
+ for rw.off < end {
+ mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
+ switch {
+ case seg.Ok():
+ // Get internal mappings.
+ ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write)
+ if err != nil {
+ retErr = err
+ goto exitLoop
+ }
+
+ // Copy to internal mappings.
+ n, err := safemem.CopySeq(ims, srcs)
+ done += n
+ rw.off += uint64(n)
+ srcs = srcs.DropFirst64(n)
+ if err != nil {
+ retErr = err
+ goto exitLoop
+ }
+
+ // Continue.
+ seg, gap = seg.NextNonEmpty()
+
+ case gap.Ok():
+ // Allocate memory for the write.
+ gapMR := gap.Range().Intersect(pgMR)
+ fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
+ if err != nil {
+ retErr = err
+ goto exitLoop
+ }
+
+ // Write to that memory as usual.
+ seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
+ }
+ }
+exitLoop:
+ // If the write ends beyond the file's previous size, it causes the
+ // file to grow.
+ if rw.off > rw.file.size {
+ atomic.StoreUint64(&rw.file.size, rw.off)
+ }
+
+ rw.file.mu.Unlock()
+ return done, retErr
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
new file mode 100644
index 000000000..7b0a962f0
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -0,0 +1,315 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "sync/atomic"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// nextFileID is used to generate unique file names.
+var nextFileID int64
+
+// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the
+// returned error is nil, then cleanup should be called when the root is no
+// longer needed.
+func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) {
+ creds := auth.CredentialsFromContext(ctx)
+
+ vfsObj := vfs.New()
+ vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ if err != nil {
+ return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
+ }
+ root := mntns.Root()
+ return vfsObj, root, func() {
+ root.DecRef()
+ mntns.DecRef(vfsObj)
+ }, nil
+}
+
+// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If
+// the returned err is nil, then cleanup should be called when the FD is no
+// longer needed.
+func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+ creds := auth.CredentialsFromContext(ctx)
+ vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1))
+
+ // Create the file that will be written to and read from.
+ fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+ Mode: linux.ModeRegular | mode,
+ })
+ if err != nil {
+ cleanup()
+ return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err)
+ }
+
+ return fd, cleanup, nil
+}
+
+// newDirFD is like newFileFD, but for directories.
+func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+ creds := auth.CredentialsFromContext(ctx)
+ vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1))
+
+ // Create the dir.
+ if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(dirname),
+ }, &vfs.MkdirOptions{
+ Mode: linux.ModeDirectory | mode,
+ }); err != nil {
+ cleanup()
+ return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err)
+ }
+
+ // Open the dir and return it.
+ fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(dirname),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+ })
+ if err != nil {
+ cleanup()
+ return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err)
+ }
+
+ return fd, cleanup, nil
+}
+
+// newPipeFD is like newFileFD, but for pipes.
+func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+ creds := auth.CredentialsFromContext(ctx)
+ vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ pipename := fmt.Sprintf("tmpfs-test-pipe-%d", atomic.AddInt64(&nextFileID, 1))
+
+ // Create the pipe.
+ if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(pipename),
+ }, &vfs.MknodOptions{
+ Mode: linux.ModeNamedPipe | mode,
+ }); err != nil {
+ cleanup()
+ return nil, nil, fmt.Errorf("failed to create pipe %q: %v", pipename, err)
+ }
+
+ // Open the pipe and return it.
+ fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(pipename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR,
+ })
+ if err != nil {
+ cleanup()
+ return nil, nil, fmt.Errorf("failed to open pipe %q: %v", pipename, err)
+ }
+
+ return fd, cleanup, nil
+}
+
+// Test that we can write some data to a file and read it back.
+func TestSimpleWriteRead(t *testing.T) {
+ ctx := contexttest.Context(t)
+ fd, cleanup, err := newFileFD(ctx, 0644)
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer cleanup()
+
+ // Write.
+ data := []byte("foobarbaz")
+ n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+ if err != nil {
+ t.Fatalf("fd.Write failed: %v", err)
+ }
+ if n != int64(len(data)) {
+ t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+ }
+ if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want {
+ t.Errorf("fd.Write left offset at %d, want %d", got, want)
+ }
+
+ // Seek back to beginning.
+ if _, err := fd.Seek(ctx, 0, linux.SEEK_SET); err != nil {
+ t.Fatalf("fd.Seek failed: %v", err)
+ }
+ if got, want := fd.Impl().(*regularFileFD).off, int64(0); got != want {
+ t.Errorf("fd.Seek(0) left offset at %d, want %d", got, want)
+ }
+
+ // Read.
+ buf := make([]byte, len(data))
+ n, err = fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+ if err != nil && err != io.EOF {
+ t.Fatalf("fd.Read failed: %v", err)
+ }
+ if n != int64(len(data)) {
+ t.Errorf("fd.Read got short read length %d, want %d", n, len(data))
+ }
+ if got, want := string(buf), string(data); got != want {
+ t.Errorf("Read got %q want %s", got, want)
+ }
+ if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want {
+ t.Errorf("fd.Write left offset at %d, want %d", got, want)
+ }
+}
+
+func TestPWrite(t *testing.T) {
+ ctx := contexttest.Context(t)
+ fd, cleanup, err := newFileFD(ctx, 0644)
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer cleanup()
+
+ // Fill file with 1k 'a's.
+ data := bytes.Repeat([]byte{'a'}, 1000)
+ n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+ if err != nil {
+ t.Fatalf("fd.Write failed: %v", err)
+ }
+ if n != int64(len(data)) {
+ t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+ }
+
+ // Write "gVisor is awesome" at various offsets.
+ buf := []byte("gVisor is awesome")
+ offsets := []int{0, 1, 2, 10, 20, 50, 100, len(data) - 100, len(data) - 1, len(data), len(data) + 1}
+ for _, offset := range offsets {
+ name := fmt.Sprintf("PWrite offset=%d", offset)
+ t.Run(name, func(t *testing.T) {
+ n, err := fd.PWrite(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.WriteOptions{})
+ if err != nil {
+ t.Errorf("fd.PWrite got err %v want nil", err)
+ }
+ if n != int64(len(buf)) {
+ t.Errorf("fd.PWrite got %d bytes want %d", n, len(buf))
+ }
+
+ // Update data to reflect expected file contents.
+ if len(data) < offset+len(buf) {
+ data = append(data, make([]byte, (offset+len(buf))-len(data))...)
+ }
+ copy(data[offset:], buf)
+
+ // Read the whole file and compare with data.
+ readBuf := make([]byte, len(data))
+ n, err = fd.PRead(ctx, usermem.BytesIOSequence(readBuf), 0, vfs.ReadOptions{})
+ if err != nil {
+ t.Fatalf("fd.PRead failed: %v", err)
+ }
+ if n != int64(len(data)) {
+ t.Errorf("fd.PRead got short read length %d, want %d", n, len(data))
+ }
+ if got, want := string(readBuf), string(data); got != want {
+ t.Errorf("PRead got %q want %s", got, want)
+ }
+
+ })
+ }
+}
+
+func TestPRead(t *testing.T) {
+ ctx := contexttest.Context(t)
+ fd, cleanup, err := newFileFD(ctx, 0644)
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer cleanup()
+
+ // Write 100 sequences of 'gVisor is awesome'.
+ data := bytes.Repeat([]byte("gVisor is awesome"), 100)
+ n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+ if err != nil {
+ t.Fatalf("fd.Write failed: %v", err)
+ }
+ if n != int64(len(data)) {
+ t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+ }
+
+ // Read various sizes from various offsets.
+ sizes := []int{0, 1, 2, 10, 20, 50, 100, 1000}
+ offsets := []int{0, 1, 2, 10, 20, 50, 100, 1000, len(data) - 100, len(data) - 1, len(data), len(data) + 1}
+
+ for _, size := range sizes {
+ for _, offset := range offsets {
+ name := fmt.Sprintf("PRead offset=%d size=%d", offset, size)
+ t.Run(name, func(t *testing.T) {
+ var (
+ wantRead []byte
+ wantErr error
+ )
+ if offset < len(data) {
+ wantRead = data[offset:]
+ } else if size > 0 {
+ wantErr = io.EOF
+ }
+ if offset+size < len(data) {
+ wantRead = wantRead[:size]
+ }
+ buf := make([]byte, size)
+ n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.ReadOptions{})
+ if err != wantErr {
+ t.Errorf("fd.PRead got err %v want %v", err, wantErr)
+ }
+ if n != int64(len(wantRead)) {
+ t.Errorf("fd.PRead got %d bytes want %d", n, len(wantRead))
+ }
+ if got := string(buf[:n]); got != string(wantRead) {
+ t.Errorf("fd.PRead got %q want %q", got, string(wantRead))
+ }
+ })
+ }
+ }
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go
new file mode 100644
index 000000000..ebe035dee
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go
@@ -0,0 +1,232 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "fmt"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func TestStatAfterCreate(t *testing.T) {
+ ctx := contexttest.Context(t)
+ mode := linux.FileMode(0644)
+
+ // Run with different file types.
+ // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets.
+ for _, typ := range []string{"file", "dir", "pipe"} {
+ t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
+ var (
+ fd *vfs.FileDescription
+ cleanup func()
+ err error
+ )
+ switch typ {
+ case "file":
+ fd, cleanup, err = newFileFD(ctx, mode)
+ case "dir":
+ fd, cleanup, err = newDirFD(ctx, mode)
+ case "pipe":
+ fd, cleanup, err = newPipeFD(ctx, mode)
+ default:
+ panic(fmt.Sprintf("unknown typ %q", typ))
+ }
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer cleanup()
+
+ got, err := fd.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ t.Fatalf("Stat failed: %v", err)
+ }
+
+ // Atime, Ctime, Mtime should all be current time (non-zero).
+ atime, ctime, mtime := got.Atime.ToNsec(), got.Ctime.ToNsec(), got.Mtime.ToNsec()
+ if atime != ctime || ctime != mtime {
+ t.Errorf("got atime=%d ctime=%d mtime=%d, wanted equal values", atime, ctime, mtime)
+ }
+ if atime == 0 {
+ t.Errorf("got atime=%d, want non-zero", atime)
+ }
+
+ // Btime should be 0, as it is not set by tmpfs.
+ if btime := got.Btime.ToNsec(); btime != 0 {
+ t.Errorf("got btime %d, want 0", got.Btime.ToNsec())
+ }
+
+ // Size should be 0.
+ if got.Size != 0 {
+ t.Errorf("got size %d, want 0", got.Size)
+ }
+
+ // Nlink should be 1 for files, 2 for dirs.
+ wantNlink := uint32(1)
+ if typ == "dir" {
+ wantNlink = 2
+ }
+ if got.Nlink != wantNlink {
+ t.Errorf("got nlink %d, want %d", got.Nlink, wantNlink)
+ }
+
+ // UID and GID are set from context creds.
+ creds := auth.CredentialsFromContext(ctx)
+ if got.UID != uint32(creds.EffectiveKUID) {
+ t.Errorf("got uid %d, want %d", got.UID, uint32(creds.EffectiveKUID))
+ }
+ if got.GID != uint32(creds.EffectiveKGID) {
+ t.Errorf("got gid %d, want %d", got.GID, uint32(creds.EffectiveKGID))
+ }
+
+ // Mode.
+ wantMode := uint16(mode)
+ switch typ {
+ case "file":
+ wantMode |= linux.S_IFREG
+ case "dir":
+ wantMode |= linux.S_IFDIR
+ case "pipe":
+ wantMode |= linux.S_IFIFO
+ default:
+ panic(fmt.Sprintf("unknown typ %q", typ))
+ }
+
+ if got.Mode != wantMode {
+ t.Errorf("got mode %x, want %x", got.Mode, wantMode)
+ }
+
+ // Ino.
+ if got.Ino == 0 {
+ t.Errorf("got ino %d, want not 0", got.Ino)
+ }
+ })
+ }
+}
+
+func TestSetStatAtime(t *testing.T) {
+ ctx := contexttest.Context(t)
+ fd, cleanup, err := newFileFD(ctx, 0644)
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer cleanup()
+
+ allStatOptions := vfs.StatOptions{Mask: linux.STATX_ALL}
+
+ // Get initial stat.
+ initialStat, err := fd.Stat(ctx, allStatOptions)
+ if err != nil {
+ t.Fatalf("Stat failed: %v", err)
+ }
+
+ // Set atime, but without the mask.
+ if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: linux.Statx{
+ Mask: 0,
+ Atime: linux.NsecToStatxTimestamp(100),
+ }}); err != nil {
+ t.Errorf("SetStat atime without mask failed: %v")
+ }
+ // Atime should be unchanged.
+ if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
+ t.Errorf("Stat got error: %v", err)
+ } else if gotStat.Atime != initialStat.Atime {
+ t.Errorf("Stat got atime %d, want %d", gotStat.Atime, initialStat.Atime)
+ }
+
+ // Set atime, this time included in the mask.
+ setStat := linux.Statx{
+ Mask: linux.STATX_ATIME,
+ Atime: linux.NsecToStatxTimestamp(100),
+ }
+ if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil {
+ t.Errorf("SetStat atime with mask failed: %v")
+ }
+ if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
+ t.Errorf("Stat got error: %v", err)
+ } else if gotStat.Atime != setStat.Atime {
+ t.Errorf("Stat got atime %d, want %d", gotStat.Atime, setStat.Atime)
+ }
+}
+
+func TestSetStat(t *testing.T) {
+ ctx := contexttest.Context(t)
+ mode := linux.FileMode(0644)
+
+ // Run with different file types.
+ // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets.
+ for _, typ := range []string{"file", "dir", "pipe"} {
+ t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
+ var (
+ fd *vfs.FileDescription
+ cleanup func()
+ err error
+ )
+ switch typ {
+ case "file":
+ fd, cleanup, err = newFileFD(ctx, mode)
+ case "dir":
+ fd, cleanup, err = newDirFD(ctx, mode)
+ case "pipe":
+ fd, cleanup, err = newPipeFD(ctx, mode)
+ default:
+ panic(fmt.Sprintf("unknown typ %q", typ))
+ }
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer cleanup()
+
+ allStatOptions := vfs.StatOptions{Mask: linux.STATX_ALL}
+
+ // Get initial stat.
+ initialStat, err := fd.Stat(ctx, allStatOptions)
+ if err != nil {
+ t.Fatalf("Stat failed: %v", err)
+ }
+
+ // Set atime, but without the mask.
+ if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: linux.Statx{
+ Mask: 0,
+ Atime: linux.NsecToStatxTimestamp(100),
+ }}); err != nil {
+ t.Errorf("SetStat atime without mask failed: %v")
+ }
+ // Atime should be unchanged.
+ if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
+ t.Errorf("Stat got error: %v", err)
+ } else if gotStat.Atime != initialStat.Atime {
+ t.Errorf("Stat got atime %d, want %d", gotStat.Atime, initialStat.Atime)
+ }
+
+ // Set atime, this time included in the mask.
+ setStat := linux.Statx{
+ Mask: linux.STATX_ATIME,
+ Atime: linux.NsecToStatxTimestamp(100),
+ }
+ if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil {
+ t.Errorf("SetStat atime with mask failed: %v")
+ }
+ if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
+ t.Errorf("Stat got error: %v", err)
+ } else if gotStat.Atime != setStat.Atime {
+ t.Errorf("Stat got atime %d, want %d", gotStat.Atime, setStat.Atime)
+ }
+ })
+ }
+}
diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index b2ac2cbeb..5246aca84 100644
--- a/pkg/sentry/fsimpl/memfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-package memfs
+package tmpfs
import (
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 4cb2a4e0f..d6960ee47 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -12,31 +12,29 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Package memfs provides a filesystem implementation that behaves like tmpfs:
+// Package tmpfs provides a filesystem implementation that behaves like tmpfs:
// the Dentry tree is the sole source of truth for the state of the filesystem.
//
-// memfs is intended primarily to demonstrate filesystem implementation
-// patterns. Real uses cases for an in-memory filesystem should use tmpfs
-// instead.
-//
// Lock order:
//
// filesystem.mu
// regularFileFD.offMu
// regularFile.mu
// inode.mu
-package memfs
+package tmpfs
import (
"fmt"
- "sync"
+ "math"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/sync"
)
// FilesystemType implements vfs.FilesystemType.
@@ -46,6 +44,12 @@ type FilesystemType struct{}
type filesystem struct {
vfsfs vfs.Filesystem
+ // memFile is used to allocate pages for regular files.
+ memFile *pgalloc.MemoryFile
+
+ // clock is a realtime clock used to set timestamps in file operations.
+ clock time.Clock
+
// mu serializes changes to the Dentry tree.
mu sync.RWMutex
@@ -54,7 +58,15 @@ type filesystem struct {
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
- var fs filesystem
+ memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
+ if memFileProvider == nil {
+ panic("MemoryFileProviderFromContext returned nil")
+ }
+ clock := time.RealtimeClockFromContext(ctx)
+ fs := filesystem{
+ memFile: memFileProvider.MemoryFile(),
+ clock: clock,
+ }
fs.vfsfs.Init(vfsObj, &fs)
root := fs.newDentry(fs.newDirectory(creds, 01777))
return &fs.vfsfs, &root.vfsd, nil
@@ -64,12 +76,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
func (fs *filesystem) Release() {
}
-// Sync implements vfs.FilesystemImpl.Sync.
-func (fs *filesystem) Sync(ctx context.Context) error {
- // All filesystem state is in-memory.
- return nil
-}
-
// dentry implements vfs.DentryImpl.
type dentry struct {
vfsd vfs.Dentry
@@ -79,11 +85,11 @@ type dentry struct {
// immutable.
inode *inode
- // memfs doesn't count references on dentries; because the dentry tree is
+ // tmpfs doesn't count references on dentries; because the dentry tree is
// the sole source of truth, it is by definition always consistent with the
// state of the filesystem. However, it does count references on inodes,
// because inode resources are released when all references are dropped.
- // (memfs doesn't really have resources to release, but we implement
+ // (tmpfs doesn't really have resources to release, but we implement
// reference counting because tmpfs regular files will.)
// dentryEntry (ugh) links dentries into their parent directory.childList.
@@ -125,47 +131,67 @@ type inode struct {
// filesystem.RmdirAt() drops the reference.
refs int64
- // Inode metadata; protected by mu and accessed using atomic memory
- // operations unless otherwise specified.
- mu sync.RWMutex
+ // Inode metadata. Writing multiple fields atomically requires holding
+ // mu; otherwise, atomic operations can be used.
+ mu sync.Mutex
mode uint32 // excluding file type bits, which are based on impl
nlink uint32 // protected by filesystem.mu instead of inode.mu
uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
gid uint32 // auth.KGID, but ...
ino uint64 // immutable
+ // Linux's tmpfs has no concept of btime.
+ atime int64 // nanoseconds
+ ctime int64 // nanoseconds
+ mtime int64 // nanoseconds
+
impl interface{} // immutable
}
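+// maxLinks is the maximum link count for an inode, the largest value
+// representable by the 32-bit nlink field.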
+const maxLinks = math.MaxUint32
+
func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+ now := fs.clock.Now().Nanoseconds()
i.refs = 1
i.mode = uint32(mode)
i.uid = uint32(creds.EffectiveKUID)
i.gid = uint32(creds.EffectiveKGID)
i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
+ // Tmpfs creation sets atime, ctime, and mtime to current time.
+ i.atime = now
+ i.ctime = now
+ i.mtime = now
// i.nlink initialized by caller
i.impl = impl
}
-// Preconditions: filesystem.mu must be locked for writing.
+// incLinksLocked increments i's link count.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+// i.nlink < maxLinks.
func (i *inode) incLinksLocked() {
- if atomic.AddUint32(&i.nlink, 1) <= 1 {
- panic("memfs.inode.incLinksLocked() called with no existing links")
+ if i.nlink == 0 {
+ panic("tmpfs.inode.incLinksLocked() called with no existing links")
}
+ if i.nlink == maxLinks {
+ panic("memfs.inode.incLinksLocked() called with maximum link count")
+ }
+ atomic.AddUint32(&i.nlink, 1)
}
-// Preconditions: filesystem.mu must be locked for writing.
+// decLinksLocked decrements i's link count.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
func (i *inode) decLinksLocked() {
- if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 {
- i.decRef()
- } else if nlink == ^uint32(0) { // negative overflow
- panic("memfs.inode.decLinksLocked() called with no existing links")
+ if i.nlink == 0 {
+ panic("tmpfs.inode.decLinksLocked() called with no existing links")
}
+ atomic.AddUint32(&i.nlink, ^uint32(0))
}
func (i *inode) incRef() {
if atomic.AddInt64(&i.refs, 1) <= 1 {
- panic("memfs.inode.incRef() called without holding a reference")
+ panic("tmpfs.inode.incRef() called without holding a reference")
}
}
@@ -184,14 +210,14 @@ func (i *inode) tryIncRef() bool {
func (i *inode) decRef() {
if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
// This is unnecessary; it's mostly to simulate what tmpfs would do.
- if regfile, ok := i.impl.(*regularFile); ok {
- regfile.mu.Lock()
- regfile.data = nil
- atomic.StoreInt64(&regfile.dataLen, 0)
- regfile.mu.Unlock()
+ if regFile, ok := i.impl.(*regularFile); ok {
+ regFile.mu.Lock()
+ regFile.data.DropAll(regFile.memFile)
+ atomic.StoreUint64(&regFile.size, 0)
+ regFile.mu.Unlock()
}
} else if refs < 0 {
- panic("memfs.inode.decRef() called without holding a reference")
+ panic("tmpfs.inode.decRef() called without holding a reference")
}
}
@@ -202,20 +228,29 @@ func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, i
// Go won't inline this function, and returning linux.Statx (which is quite
// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
// output parameter.
+//
+// Note that Linux does not guarantee to return consistent data (in the case of
+// a concurrent modification), so we do not require holding inode.mu.
func (i *inode) statTo(stat *linux.Statx) {
- stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+ stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
+ linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_ATIME |
+ linux.STATX_BTIME | linux.STATX_CTIME | linux.STATX_MTIME
stat.Blksize = 1 // usermem.PageSize in tmpfs
stat.Nlink = atomic.LoadUint32(&i.nlink)
stat.UID = atomic.LoadUint32(&i.uid)
stat.GID = atomic.LoadUint32(&i.gid)
stat.Mode = uint16(atomic.LoadUint32(&i.mode))
stat.Ino = i.ino
- // TODO: device number
+ // Linux's tmpfs has no concept of btime, so zero-value is returned.
+ stat.Atime = linux.NsecToStatxTimestamp(i.atime)
+ stat.Ctime = linux.NsecToStatxTimestamp(i.ctime)
+ stat.Mtime = linux.NsecToStatxTimestamp(i.mtime)
+ // TODO(gvisor.dev/issues/1197): Device number.
switch impl := i.impl.(type) {
case *regularFile:
stat.Mode |= linux.S_IFREG
stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
- stat.Size = uint64(atomic.LoadInt64(&impl.dataLen))
+ stat.Size = uint64(atomic.LoadUint64(&impl.size))
// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
// a uint64 accessed using atomic memory operations to avoid taking
// locks).
@@ -234,6 +269,36 @@ func (i *inode) statTo(stat *linux.Statx) {
}
}
+func (i *inode) setStat(stat linux.Statx) error {
+ // TODO(gvisor.dev/issues/1197): Handle stat.Size by growing/shrinking
+ // the file.
+ if stat.Mask == 0 {
+ return nil
+ }
+ i.mu.Lock()
+ mask := stat.Mask
+ if mask&linux.STATX_MODE != 0 {
+ atomic.StoreUint32(&i.mode, uint32(stat.Mode))
+ }
+ if mask&linux.STATX_UID != 0 {
+ atomic.StoreUint32(&i.uid, stat.UID)
+ }
+ if mask&linux.STATX_GID != 0 {
+ atomic.StoreUint32(&i.gid, stat.GID)
+ }
+ if mask&linux.STATX_ATIME != 0 {
+ atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped())
+ }
+ if mask&linux.STATX_CTIME != 0 {
+ atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped())
+ }
+ if mask&linux.STATX_MTIME != 0 {
+ atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
+ }
+ i.mu.Unlock()
+ return nil
+}
+
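Aside: setStat above applies only the fields named in stat.Mask, storing each selected field atomically while holding inode.mu, which pairs with statTo's lock-free atomic loads (Linux likewise does not promise a consistent stat snapshot). A simplified sketch of the mask-dispatch shape, with the mask constants and field set reduced to invented stand-ins rather than the real linux.STATX_* values:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

const (
	statxMode uint32 = 1 << 0
	statxUID  uint32 = 1 << 1
	statxGID  uint32 = 1 << 2
)

type inode struct {
	mu   sync.Mutex
	mode uint32
	uid  uint32
	gid  uint32
}

// setStat only touches the fields selected by mask, so a caller can change
// e.g. the mode without clobbering ownership. Stores are atomic so that
// lock-free readers (the statTo side) never observe torn values.
func (i *inode) setStat(mask, mode, uid, gid uint32) {
	if mask == 0 {
		return
	}
	i.mu.Lock()
	defer i.mu.Unlock()
	if mask&statxMode != 0 {
		atomic.StoreUint32(&i.mode, mode)
	}
	if mask&statxUID != 0 {
		atomic.StoreUint32(&i.uid, uid)
	}
	if mask&statxGID != 0 {
		atomic.StoreUint32(&i.gid, gid)
	}
}

func main() {
	i := &inode{mode: 0644, uid: 1000, gid: 1000}
	i.setStat(statxMode, 0600, 0, 0) // only the mode changes
	fmt.Println(i.mode, i.uid, i.gid)
}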
// allocatedBlocksForSize returns the number of 512B blocks needed to
// accommodate the given size in bytes, as appropriate for struct
// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
@@ -256,13 +321,11 @@ func (i *inode) direntType() uint8 {
}
}
-// fileDescription is embedded by memfs implementations of
+// fileDescription is embedded by tmpfs implementations of
// vfs.FileDescriptionImpl.
type fileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
-
- flags uint32 // status flags; immutable
}
func (fd *fileDescription) filesystem() *filesystem {
@@ -273,18 +336,6 @@ func (fd *fileDescription) inode() *inode {
return fd.vfsfd.Dentry().Impl().(*dentry).inode
}
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
- return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
- // None of the flags settable by fcntl(F_SETFL) are supported, so this is a
- // no-op.
- return nil
-}
-
// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
var stat linux.Statx
@@ -294,9 +345,5 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
- if opts.Stat.Mask == 0 {
- return nil
- }
- // TODO: implement inode.setStat
- return syserror.EPERM
+ return fd.inode().setStat(opts.Stat)
}
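Aside: the allocatedBlocksForSize helper whose comment is partially visible in this file rounds a byte count up to 512-byte units, matching struct statx::stx_blocks semantics regardless of the filesystem's actual block size. The arithmetic is a plain ceiling division; a small sketch under that assumption:

package main

import "fmt"

// allocatedBlocksForSize returns the number of 512-byte blocks needed to
// hold size bytes, i.e. ceil(size/512), as used for stx_blocks.
func allocatedBlocksForSize(size uint64) uint64 {
	return (size + 511) / 512
}

func main() {
	fmt.Println(allocatedBlocksForSize(0))   // 0
	fmt.Println(allocatedBlocksForSize(1))   // 1
	fmt.Println(allocatedBlocksForSize(512)) // 1
	fmt.Println(allocatedBlocksForSize(513)) // 2
}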
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 2706927ff..ac85ba0c8 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -35,7 +35,7 @@ go_template_instance(
out = "seqatomic_taskgoroutineschedinfo_unsafe.go",
package = "kernel",
suffix = "TaskGoroutineSchedInfo",
- template = "//pkg/syncutil:generic_seqatomic",
+ template = "//pkg/sync:generic_seqatomic",
types = {
"Value": "TaskGoroutineSchedInfo",
},
@@ -209,7 +209,7 @@ go_library(
"//pkg/sentry/usermem",
"//pkg/state",
"//pkg/state/statefile",
- "//pkg/syncutil",
+ "//pkg/sync",
"//pkg/syserr",
"//pkg/syserror",
"//pkg/tcpip",
@@ -241,6 +241,7 @@ go_test(
"//pkg/sentry/time",
"//pkg/sentry/usage",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
],
)
diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go
index 244655b5c..920fe4329 100644
--- a/pkg/sentry/kernel/abstract_socket_namespace.go
+++ b/pkg/sentry/kernel/abstract_socket_namespace.go
@@ -15,11 +15,11 @@
package kernel
import (
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sync"
)
// +stateify savable
diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD
index 04c244447..1aa72fa47 100644
--- a/pkg/sentry/kernel/auth/BUILD
+++ b/pkg/sentry/kernel/auth/BUILD
@@ -8,7 +8,7 @@ go_template_instance(
out = "atomicptr_credentials_unsafe.go",
package = "auth",
suffix = "Credentials",
- template = "//pkg/syncutil:generic_atomicptr",
+ template = "//pkg/sync:generic_atomicptr",
types = {
"Value": "Credentials",
},
@@ -64,6 +64,7 @@ go_library(
"//pkg/bits",
"//pkg/log",
"//pkg/sentry/context",
+ "//pkg/sync",
"//pkg/syserror",
],
)
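Aside: several BUILD hunks in this change re-point go_template_instance at //pkg/sync:generic_atomicptr (and generic_seqatomic), which generate type-specialized wrappers over atomically accessed values. The generated code is not shown in the diff; the sketch below only approximates the idea using sync/atomic.Value from the standard library, and the Credentials type here is a stand-in rather than the real auth.Credentials.

package main

import (
	"fmt"
	"sync/atomic"
)

type Credentials struct{ UID, GID uint32 }

// AtomicPtrCredentials is roughly what a generic_atomicptr instantiation
// provides: lock-free load/store of a *Credentials behind a typed API, so
// callers cannot accidentally store the wrong pointer type.
type AtomicPtrCredentials struct {
	v atomic.Value // always holds a *Credentials
}

func (p *AtomicPtrCredentials) Load() *Credentials {
	c, _ := p.v.Load().(*Credentials)
	return c
}

func (p *AtomicPtrCredentials) Store(c *Credentials) { p.v.Store(c) }

func main() {
	var p AtomicPtrCredentials
	p.Store(&Credentials{UID: 1000, GID: 1000})
	fmt.Printf("%+v\n", *p.Load())
}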
diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go
index af28ccc65..9dd52c860 100644
--- a/pkg/sentry/kernel/auth/user_namespace.go
+++ b/pkg/sentry/kernel/auth/user_namespace.go
@@ -16,8 +16,8 @@ package auth
import (
"math"
- "sync"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD
index 3361e8b7d..c47f6b6fc 100644
--- a/pkg/sentry/kernel/epoll/BUILD
+++ b/pkg/sentry/kernel/epoll/BUILD
@@ -32,6 +32,7 @@ go_library(
"//pkg/sentry/fs/anon",
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/waiter",
],
)
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index 9c0a4e1b4..430311cc0 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -18,7 +18,6 @@ package epoll
import (
"fmt"
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/refs"
@@ -27,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/anon"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD
index e65b961e8..c831fbab2 100644
--- a/pkg/sentry/kernel/eventfd/BUILD
+++ b/pkg/sentry/kernel/eventfd/BUILD
@@ -16,6 +16,7 @@ go_library(
"//pkg/sentry/fs/anon",
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go
index 12f0d429b..687690679 100644
--- a/pkg/sentry/kernel/eventfd/eventfd.go
+++ b/pkg/sentry/kernel/eventfd/eventfd.go
@@ -18,7 +18,6 @@ package eventfd
import (
"math"
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -28,6 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/anon"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD
index 49d81b712..6b36bc63e 100644
--- a/pkg/sentry/kernel/fasync/BUILD
+++ b/pkg/sentry/kernel/fasync/BUILD
@@ -12,6 +12,7 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
+ "//pkg/sync",
"//pkg/waiter",
],
)
diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go
index 6b0bb0324..d32c3e90a 100644
--- a/pkg/sentry/kernel/fasync/fasync.go
+++ b/pkg/sentry/kernel/fasync/fasync.go
@@ -16,12 +16,11 @@
package fasync
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go
index 11f613a11..cd1501f85 100644
--- a/pkg/sentry/kernel/fd_table.go
+++ b/pkg/sentry/kernel/fd_table.go
@@ -18,7 +18,6 @@ import (
"bytes"
"fmt"
"math"
- "sync"
"sync/atomic"
"syscall"
@@ -28,6 +27,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sync"
)
// FDFlags define flags for an individual descriptor.
diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go
index 2bcb6216a..eccb7d1e7 100644
--- a/pkg/sentry/kernel/fd_table_test.go
+++ b/pkg/sentry/kernel/fd_table_test.go
@@ -16,7 +16,6 @@ package kernel
import (
"runtime"
- "sync"
"testing"
"gvisor.dev/gvisor/pkg/sentry/context"
@@ -24,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/filetest"
"gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sync"
)
const (
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index ded27d668..2448c1d99 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -16,10 +16,10 @@ package kernel
import (
"fmt"
- "sync"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sync"
)
// FSContext contains filesystem context.
diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD
index 75ec31761..50db443ce 100644
--- a/pkg/sentry/kernel/futex/BUILD
+++ b/pkg/sentry/kernel/futex/BUILD
@@ -9,7 +9,7 @@ go_template_instance(
out = "atomicptr_bucket_unsafe.go",
package = "futex",
suffix = "Bucket",
- template = "//pkg/syncutil:generic_atomicptr",
+ template = "//pkg/sync:generic_atomicptr",
types = {
"Value": "bucket",
},
@@ -42,6 +42,7 @@ go_library(
"//pkg/sentry/context",
"//pkg/sentry/memmap",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
],
)
@@ -51,5 +52,8 @@ go_test(
size = "small",
srcs = ["futex_test.go"],
embed = [":futex"],
- deps = ["//pkg/sentry/usermem"],
+ deps = [
+ "//pkg/sentry/usermem",
+ "//pkg/sync",
+ ],
)
diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go
index 278cc8143..d1931c8f4 100644
--- a/pkg/sentry/kernel/futex/futex.go
+++ b/pkg/sentry/kernel/futex/futex.go
@@ -18,11 +18,10 @@
package futex
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go
index 65e5d1428..c23126ca5 100644
--- a/pkg/sentry/kernel/futex/futex_test.go
+++ b/pkg/sentry/kernel/futex/futex_test.go
@@ -17,13 +17,13 @@ package futex
import (
"math"
"runtime"
- "sync"
"sync/atomic"
"syscall"
"testing"
"unsafe"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
)
// testData implements the Target interface, and allows us to
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 28ba950bd..c85e97fef 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -36,7 +36,6 @@ import (
"fmt"
"io"
"path/filepath"
- "sync"
"sync/atomic"
"time"
@@ -67,6 +66,7 @@ import (
uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
"gvisor.dev/gvisor/pkg/state"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
)
@@ -762,7 +762,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
mounts.IncRef()
}
- tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
+ tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
ctx := args.NewContext(k)
// Get the root directory from the MountNamespace.
@@ -841,9 +841,11 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
AbstractSocketNamespace: args.AbstractSocketNamespace,
ContainerID: args.ContainerID,
}
- if _, err := k.tasks.NewTask(config); err != nil {
+ t, err := k.tasks.NewTask(config)
+ if err != nil {
return nil, 0, err
}
+ t.traceExecEvent(tc) // Simulate exec for tracing.
// Success.
tgid := k.tasks.Root.IDOfThreadGroup(tg)
@@ -1118,6 +1120,22 @@ func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
return lastErr
}
+// RebuildTraceContexts rebuilds the trace context for all tasks.
+//
+// Unfortunately, if these are built while tracing is not enabled, then we will
+// not have meaningful trace data. Rebuilding here ensures that we can do so
+// after tracing has been enabled.
+func (k *Kernel) RebuildTraceContexts() {
+ k.extMu.Lock()
+ defer k.extMu.Unlock()
+ k.tasks.mu.RLock()
+ defer k.tasks.mu.RUnlock()
+
+ for t, tid := range k.tasks.Root.tids {
+ t.rebuildTraceContext(tid)
+ }
+}
+
// FeatureSet returns the FeatureSet.
func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
return k.featureSet
@@ -1173,6 +1191,11 @@ func (k *Kernel) GlobalInit() *ThreadGroup {
return k.globalInit
}
+// TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace.
+func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) {
+ k.globalInit = tg
+}
+
// ApplicationCores returns the number of CPUs visible to sandboxed
// applications.
func (k *Kernel) ApplicationCores() uint {
diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD
index d7a7d1169..7f36252a9 100644
--- a/pkg/sentry/kernel/memevent/BUILD
+++ b/pkg/sentry/kernel/memevent/BUILD
@@ -16,6 +16,7 @@ go_library(
"//pkg/metric",
"//pkg/sentry/kernel",
"//pkg/sentry/usage",
+ "//pkg/sync",
],
)
diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go
index b0d98e7f0..200565bb8 100644
--- a/pkg/sentry/kernel/memevent/memory_events.go
+++ b/pkg/sentry/kernel/memevent/memory_events.go
@@ -17,7 +17,6 @@
package memevent
import (
- "sync"
"time"
"gvisor.dev/gvisor/pkg/eventchannel"
@@ -26,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
pb "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto"
"gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sync"
)
var totalTicks = metric.MustCreateNewUint64Metric("/memory_events/ticks", false /*sync*/, "Total number of memory event periods that have elapsed since startup.")
diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD
index 9d34f6d4d..5eeaeff66 100644
--- a/pkg/sentry/kernel/pipe/BUILD
+++ b/pkg/sentry/kernel/pipe/BUILD
@@ -43,6 +43,7 @@ go_library(
"//pkg/sentry/safemem",
"//pkg/sentry/usermem",
"//pkg/sentry/vfs",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go
index 95bee2d37..1c0f34269 100644
--- a/pkg/sentry/kernel/pipe/buffer.go
+++ b/pkg/sentry/kernel/pipe/buffer.go
@@ -16,9 +16,9 @@ package pipe
import (
"io"
- "sync"
"gvisor.dev/gvisor/pkg/sentry/safemem"
+ "gvisor.dev/gvisor/pkg/sync"
)
// buffer encapsulates a queueable byte buffer.
diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go
index 4a19ab7ce..716f589af 100644
--- a/pkg/sentry/kernel/pipe/node.go
+++ b/pkg/sentry/kernel/pipe/node.go
@@ -15,12 +15,11 @@
package pipe
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 1a1b38f83..e4fd7d420 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -17,12 +17,12 @@ package pipe
import (
"fmt"
- "sync"
"sync/atomic"
"syscall"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go
index ef9641e6a..8394eb78b 100644
--- a/pkg/sentry/kernel/pipe/pipe_util.go
+++ b/pkg/sentry/kernel/pipe/pipe_util.go
@@ -17,7 +17,6 @@ package pipe
import (
"io"
"math"
- "sync"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -25,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index 6416e0dd8..bf7461cbb 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -15,13 +15,12 @@
package pipe
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go
index 24ea002ba..b14429854 100644
--- a/pkg/sentry/kernel/rseq.go
+++ b/pkg/sentry/kernel/rseq.go
@@ -15,17 +15,29 @@
package kernel
import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
)
-// Restartable sequences, as described in https://lwn.net/Articles/650333/.
+// Restartable sequences.
+//
+// We support two different APIs for restartable sequences.
+//
+// 1. The upstream interface added in v4.18.
+// 2. The interface described in https://lwn.net/Articles/650333/.
+//
+// Throughout this file and other parts of the kernel, the latter is referred
+// to as "old rseq". This interface was never merged upstream, but is supported
+// for a limited set of applications that use it regardless.
-// RSEQCriticalRegion describes a restartable sequence critical region.
+// OldRSeqCriticalRegion describes an old rseq critical region.
//
// +stateify savable
-type RSEQCriticalRegion struct {
+type OldRSeqCriticalRegion struct {
// When a task in this thread group has its CPU preempted (as defined by
// platform.ErrContextCPUPreempted) or has a signal delivered to an
// application handler while its instruction pointer is in CriticalSection,
@@ -35,86 +47,359 @@ type RSEQCriticalRegion struct {
Restart usermem.Addr
}
-// RSEQAvailable returns true if t supports restartable sequences.
-func (t *Task) RSEQAvailable() bool {
+// RSeqAvailable returns true if t supports (old and new) restartable sequences.
+func (t *Task) RSeqAvailable() bool {
return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption()
}
-// RSEQCriticalRegion returns a copy of t's thread group's current restartable
-// sequence.
-func (t *Task) RSEQCriticalRegion() RSEQCriticalRegion {
- return *t.tg.rscr.Load().(*RSEQCriticalRegion)
+// SetRSeq registers addr as this thread's rseq structure.
+//
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error {
+ if t.rseqAddr != 0 {
+ if t.rseqAddr != addr {
+ return syserror.EINVAL
+ }
+ if t.rseqSignature != signature {
+ return syserror.EINVAL
+ }
+ return syserror.EBUSY
+ }
+
+ // rseq must be aligned and correctly sized.
+ if addr&(linux.AlignOfRSeq-1) != 0 {
+ return syserror.EINVAL
+ }
+ if length != linux.SizeOfRSeq {
+ return syserror.EINVAL
+ }
+ if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok {
+ return syserror.EFAULT
+ }
+
+ t.rseqAddr = addr
+ t.rseqSignature = signature
+
+ // Initialize the CPUID.
+ //
+ // Linux implicitly does this on return from userspace, where failure
+ // would cause SIGSEGV.
+ if err := t.rseqUpdateCPU(); err != nil {
+ t.rseqAddr = 0
+ t.rseqSignature = 0
+
+ t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ return syserror.EFAULT
+ }
+
+ return nil
}
-// SetRSEQCriticalRegion replaces t's thread group's restartable sequence.
+// ClearRSeq unregisters addr as this thread's rseq structure.
//
-// Preconditions: t.RSEQAvailable() == true.
-func (t *Task) SetRSEQCriticalRegion(rscr RSEQCriticalRegion) error {
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error {
+ if t.rseqAddr == 0 {
+ return syserror.EINVAL
+ }
+ if t.rseqAddr != addr {
+ return syserror.EINVAL
+ }
+ if length != linux.SizeOfRSeq {
+ return syserror.EINVAL
+ }
+ if t.rseqSignature != signature {
+ return syserror.EPERM
+ }
+
+ if err := t.rseqClearCPU(); err != nil {
+ return err
+ }
+
+ t.rseqAddr = 0
+ t.rseqSignature = 0
+
+ if t.oldRSeqCPUAddr == 0 {
+ // rseqCPU no longer needed.
+ t.rseqCPU = -1
+ }
+
+ return nil
+}
+
+// OldRSeqCriticalRegion returns a copy of t's thread group's current
+// old restartable sequence.
+func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion {
+ return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
+}
+
+// SetOldRSeqCriticalRegion replaces t's thread group's old restartable
+// sequence.
+//
+// Preconditions: t.RSeqAvailable() == true.
+func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error {
// These checks are somewhat more lenient than in Linux, which (bizarrely)
- // requires rscr.CriticalSection to be non-empty and rscr.Restart to be
- // outside of rscr.CriticalSection, even if rscr.CriticalSection.Start == 0
+ // requires r.CriticalSection to be non-empty and r.Restart to be
+ // outside of r.CriticalSection, even if r.CriticalSection.Start == 0
// (which disables the critical region).
- if rscr.CriticalSection.Start == 0 {
- rscr.CriticalSection.End = 0
- rscr.Restart = 0
- t.tg.rscr.Store(&rscr)
+ if r.CriticalSection.Start == 0 {
+ r.CriticalSection.End = 0
+ r.Restart = 0
+ t.tg.oldRSeqCritical.Store(&r)
return nil
}
- if rscr.CriticalSection.Start >= rscr.CriticalSection.End {
+ if r.CriticalSection.Start >= r.CriticalSection.End {
return syserror.EINVAL
}
- if rscr.CriticalSection.Contains(rscr.Restart) {
+ if r.CriticalSection.Contains(r.Restart) {
return syserror.EINVAL
}
- // TODO(jamieliu): check that rscr.CriticalSection and rscr.Restart are in
- // the application address range, for consistency with Linux
- t.tg.rscr.Store(&rscr)
+ // TODO(jamieliu): check that r.CriticalSection and r.Restart are in
+ // the application address range, for consistency with Linux.
+ t.tg.oldRSeqCritical.Store(&r)
return nil
}
-// RSEQCPUAddr returns the address that RSEQ will keep updated with t's CPU
-// number.
+// OldRSeqCPUAddr returns the address that old rseq will keep updated with t's
+// CPU number.
//
// Preconditions: The caller must be running on the task goroutine.
-func (t *Task) RSEQCPUAddr() usermem.Addr {
- return t.rseqCPUAddr
+func (t *Task) OldRSeqCPUAddr() usermem.Addr {
+ return t.oldRSeqCPUAddr
}
-// SetRSEQCPUAddr replaces the address that RSEQ will keep updated with t's CPU
-// number.
+// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
+// t's CPU number.
//
-// Preconditions: t.RSEQAvailable() == true. The caller must be running on the
+// Preconditions: t.RSeqAvailable() == true. The caller must be running on the
// task goroutine. t's AddressSpace must be active.
-func (t *Task) SetRSEQCPUAddr(addr usermem.Addr) error {
- t.rseqCPUAddr = addr
- if addr != 0 {
- t.rseqCPU = int32(hostcpu.GetCPU())
- if err := t.rseqCopyOutCPU(); err != nil {
- t.rseqCPUAddr = 0
- t.rseqCPU = -1
- return syserror.EINVAL // yes, EINVAL, not err or EFAULT
- }
- } else {
- t.rseqCPU = -1
+func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
+ t.oldRSeqCPUAddr = addr
+
+ // Check that addr is writable.
+ //
+ // N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's
+ // unfortunate, but unlikely in a correct program.
+ if err := t.rseqUpdateCPU(); err != nil {
+ t.oldRSeqCPUAddr = 0
+ return syserror.EINVAL // yes, EINVAL, not err or EFAULT
}
return nil
}
// Preconditions: The caller must be running on the task goroutine. t's
// AddressSpace must be active.
-func (t *Task) rseqCopyOutCPU() error {
+func (t *Task) rseqUpdateCPU() error {
+ if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
+ t.rseqCPU = -1
+ return nil
+ }
+
+ t.rseqCPU = int32(hostcpu.GetCPU())
+
+ // Update both CPUs, even if one fails.
+ rerr := t.rseqCopyOutCPU()
+ oerr := t.oldRSeqCopyOutCPU()
+
+ if rerr != nil {
+ return rerr
+ }
+ return oerr
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) oldRSeqCopyOutCPU() error {
+ if t.oldRSeqCPUAddr == 0 {
+ return nil
+ }
+
buf := t.CopyScratchBuffer(4)
usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))
- _, err := t.CopyOutBytes(t.rseqCPUAddr, buf)
+ _, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf)
+ return err
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqCopyOutCPU() error {
+ if t.rseqAddr == 0 {
+ return nil
+ }
+
+ buf := t.CopyScratchBuffer(8)
+ // CPUIDStart and CPUID are the first two fields in linux.RSeq.
+ usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart
+ usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID
+ // N.B. This write is not atomic, but since this occurs on the task
+ // goroutine then as long as userspace uses a single-instruction read
+ // it can't see an invalid value.
+ _, err := t.CopyOutBytes(t.rseqAddr, buf)
+ return err
+}
+
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqClearCPU() error {
+ buf := t.CopyScratchBuffer(8)
+ // CPUIDStart and CPUID are the first two fields in linux.RSeq.
+ usermem.ByteOrder.PutUint32(buf, 0) // CPUIDStart
+ usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID
+ // N.B. This write is not atomic, but since this occurs on the task
+ // goroutine then as long as userspace uses a single-instruction read
+ // it can't see an invalid value.
+ _, err := t.CopyOutBytes(t.rseqAddr, buf)
return err
}
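
Aside: rseqCopyOutCPU and rseqClearCPU build an 8-byte scratch buffer covering the first two 4-byte fields of the userspace rseq struct (cpu_id_start, then cpu_id) and copy it out in one write. The standalone sketch below shows only that buffer layout; the field offsets, little-endian byte order, and the -1 "uninitialized" sentinel are assumptions drawn from the comments above rather than from linux.RSeq itself.

package main

import (
	"encoding/binary"
	"fmt"
)

const rseqCPUIDUninitialized = ^uint32(0) // assumed sentinel: (u32)(-1)

// packCPU builds the 8 bytes that land at the start of the rseq struct:
// cpu_id_start followed by cpu_id, both set to the current CPU.
func packCPU(cpu uint32) []byte {
	buf := make([]byte, 8)
	binary.LittleEndian.PutUint32(buf[0:4], cpu) // cpu_id_start
	binary.LittleEndian.PutUint32(buf[4:8], cpu) // cpu_id
	return buf
}

// packCleared is the rseqClearCPU counterpart: cpu_id_start is zeroed and
// cpu_id is set to the "uninitialized" sentinel.
func packCleared() []byte {
	buf := make([]byte, 8)
	binary.LittleEndian.PutUint32(buf[0:4], 0)
	binary.LittleEndian.PutUint32(buf[4:8], rseqCPUIDUninitialized)
	return buf
}

func main() {
	fmt.Printf("% x\n", packCPU(3))
	fmt.Printf("% x\n", packCleared())
}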
+// rseqAddrInterrupt checks if IP is in a critical section, and aborts if so.
+//
+// This is a bit complex since both the RSeq and RSeqCriticalSection structs
+// are stored in userspace. So we must:
+//
+// 1. Copy in the address of RSeqCriticalSection from RSeq.
+// 2. Copy in RSeqCriticalSection itself.
+// 3. Validate critical section struct version, address range, abort address.
+// 4. Validate the abort signature (4 bytes preceding abort IP match expected
+// signature).
+// 5. Clear address of RSeqCriticalSection from RSeq.
+// 6. Finally, conditionally abort.
+//
+// See kernel/rseq.c:rseq_ip_fixup for reference.
+//
+// Preconditions: The caller must be running on the task goroutine. t's
+// AddressSpace must be active.
+func (t *Task) rseqAddrInterrupt() {
+ if t.rseqAddr == 0 {
+ return
+ }
+
+ critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection)
+ if !ok {
+ // SetRSeq should validate this.
+ panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr))
+ }
+
+ if t.Arch().Width() != 8 {
+ // We only handle 64-bit for now.
+ t.Debugf("Only 64-bit rseq supported.")
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ return
+ }
+
+ buf := t.CopyScratchBuffer(8)
+ if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil {
+ t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err)
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ return
+ }
+
+ critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf))
+ if critAddr == 0 {
+ return
+ }
+
+ buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection)
+ if _, err := t.CopyInBytes(critAddr, buf); err != nil {
+ t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ return
+ }
+
+ // Manually marshal RSeqCriticalSection as this is in the hot path when
+ // rseq is enabled. It must be as fast as possible.
+ //
+ // TODO(b/130243041): Replace with go_marshal.
+ cs := linux.RSeqCriticalSection{
+ Version: usermem.ByteOrder.Uint32(buf[0:4]),
+ Flags: usermem.ByteOrder.Uint32(buf[4:8]),
+ Start: usermem.ByteOrder.Uint64(buf[8:16]),
+ PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]),
+ Abort: usermem.ByteOrder.Uint64(buf[24:32]),
+ }
+
+ if cs.Version != 0 {
+ t.Debugf("Unknown version in %+v", cs)
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ return
+ }
+
+ start := usermem.Addr(cs.Start)
+ critRange, ok := start.ToRange(cs.PostCommitOffset)
+ if !ok {
+ t.Debugf("Invalid start and offset in %+v", cs)
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ return
+ }
+
+ abort := usermem.Addr(cs.Abort)
+ if critRange.Contains(abort) {
+ t.Debugf("Abort in critical section in %+v", cs)
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ return
+ }
+
+ // Verify signature.
+ sigAddr := abort - linux.SizeOfRSeqSignature
+
+ buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature)
+ if _, err := t.CopyInBytes(sigAddr, buf); err != nil {
+ t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err)
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ return
+ }
+
+ sig := usermem.ByteOrder.Uint32(buf)
+ if sig != t.rseqSignature {
+ t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature)
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ return
+ }
+
+ // Clear the critical section address.
+ //
+ // NOTE(b/143949567): We don't support any rseq flags, so we always
+ // restart if we are in the critical section, and thus *always* clear
+ // critAddrAddr.
+ if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{
+ AddressSpaceActive: true,
+ }); err != nil {
+ t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err)
+ t.forceSignal(linux.SIGSEGV, false /* unconditional */)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ return
+ }
+
+ // Finally we can actually decide whether or not to restart.
+ if !critRange.Contains(usermem.Addr(t.Arch().IP())) {
+ return
+ }
+
+ t.Arch().SetIP(uintptr(cs.Abort))
+}
+
// Preconditions: The caller must be running on the task goroutine.
-func (t *Task) rseqInterrupt() {
- rscr := t.tg.rscr.Load().(*RSEQCriticalRegion)
- if ip := t.Arch().IP(); rscr.CriticalSection.Contains(usermem.Addr(ip)) {
- t.Debugf("Interrupted RSEQ critical section at %#x; restarting at %#x", ip, rscr.Restart)
- t.Arch().SetIP(uintptr(rscr.Restart))
- t.Arch().SetRSEQInterruptedIP(ip)
+func (t *Task) oldRSeqInterrupt() {
+ r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
+ if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) {
+ t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart)
+ t.Arch().SetIP(uintptr(r.Restart))
+ t.Arch().SetOldRSeqInterruptedIP(ip)
}
}
+
+// Preconditions: The caller must be running on the task goroutine.
+func (t *Task) rseqInterrupt() {
+ t.rseqAddrInterrupt()
+ t.oldRSeqInterrupt()
+}
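Aside: rseqAddrInterrupt manually unmarshals the 32-byte critical-section descriptor and then performs the checks the comment lists: the version must be zero, [start, start+post_commit_offset) must be a sane range, and the abort target must lie outside that range. A condensed stdlib-only sketch of the same parse-and-validate step follows; the struct layout mirrors the manual unmarshaling above, and the example addresses in main are arbitrary.

package main

import (
	"encoding/binary"
	"errors"
	"fmt"
)

type criticalSection struct {
	Version          uint32
	Flags            uint32
	Start            uint64
	PostCommitOffset uint64
	Abort            uint64
}

func parseCriticalSection(buf []byte) (criticalSection, error) {
	if len(buf) < 32 {
		return criticalSection{}, errors.New("short rseq_cs buffer")
	}
	return criticalSection{
		Version:          binary.LittleEndian.Uint32(buf[0:4]),
		Flags:            binary.LittleEndian.Uint32(buf[4:8]),
		Start:            binary.LittleEndian.Uint64(buf[8:16]),
		PostCommitOffset: binary.LittleEndian.Uint64(buf[16:24]),
		Abort:            binary.LittleEndian.Uint64(buf[24:32]),
	}, nil
}

// validate mirrors the checks in rseqAddrInterrupt: unknown versions are
// rejected, the critical range must not wrap, and the abort target must lie
// outside the critical range (otherwise aborting could loop forever).
func (cs criticalSection) validate() error {
	if cs.Version != 0 {
		return errors.New("unknown rseq_cs version")
	}
	end := cs.Start + cs.PostCommitOffset
	if end < cs.Start {
		return errors.New("critical section wraps the address space")
	}
	if cs.Abort >= cs.Start && cs.Abort < end {
		return errors.New("abort IP inside critical section")
	}
	return nil
}

func main() {
	raw := make([]byte, 32)
	binary.LittleEndian.PutUint64(raw[8:16], 0x1000)  // Start
	binary.LittleEndian.PutUint64(raw[16:24], 0x40)   // PostCommitOffset
	binary.LittleEndian.PutUint64(raw[24:32], 0x2000) // Abort, outside the range
	cs, err := parseCriticalSection(raw)
	if err != nil {
		panic(err)
	}
	fmt.Println(cs, cs.validate()) // validate returns nil: all checks pass
}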
diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD
index f4c00cd86..13a961594 100644
--- a/pkg/sentry/kernel/semaphore/BUILD
+++ b/pkg/sentry/kernel/semaphore/BUILD
@@ -31,6 +31,7 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/time",
+ "//pkg/sync",
"//pkg/syserror",
],
)
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index 93fe68a3e..18299814e 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -17,7 +17,6 @@ package semaphore
import (
"fmt"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
@@ -25,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -302,7 +302,7 @@ func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Cred
return syserror.ERANGE
}
- // TODO(b/29354920): Clear undo entries in all processes
+ // TODO(gvisor.dev/issue/137): Clear undo entries in all processes.
sem.value = val
sem.pid = pid
s.changeTime = ktime.NowFromContext(ctx)
@@ -336,7 +336,7 @@ func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credenti
for i, val := range vals {
sem := &s.sems[i]
- // TODO(b/29354920): Clear undo entries in all processes
+ // TODO(gvisor.dev/issue/137): Clear undo entries in all processes.
sem.value = int16(val)
sem.pid = pid
sem.wakeWaiters()
@@ -481,7 +481,7 @@ func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (ch
}
// All operations succeeded, apply them.
- // TODO(b/29354920): handle undo operations.
+ // TODO(gvisor.dev/issue/137): handle undo operations.
for i, v := range tmpVals {
s.sems[i].value = v
s.sems[i].wakeWaiters()
diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD
index cd48945e6..7321b22ed 100644
--- a/pkg/sentry/kernel/shm/BUILD
+++ b/pkg/sentry/kernel/shm/BUILD
@@ -24,6 +24,7 @@ go_library(
"//pkg/sentry/platform",
"//pkg/sentry/usage",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
],
)
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index 5bd610f68..8ddef7eb8 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -35,7 +35,6 @@ package shm
import (
"fmt"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
@@ -49,6 +48,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -71,9 +71,20 @@ type Registry struct {
mu sync.Mutex `state:"nosave"`
// shms maps segment ids to segments.
+ //
+ // shms holds all referenced segments, which are removed on the last
+ // DecRef. Thus, it cannot itself hold a reference on the Shm.
+ //
+ // Since removal only occurs after the last (unlocked) DecRef, there
+ // exists a short window during which a Shm still exists in shms, but is
+ // unreferenced. Users must use TryIncRef to determine if the Shm is
+ // still valid.
shms map[ID]*Shm
// keysToShms maps segment keys to segments.
+ //
+ // Shms in keysToShms are guaranteed to be referenced, as they are
+ // removed by disassociateKey before the last DecRef.
keysToShms map[Key]*Shm
// Sum of the sizes of all existing segments rounded up to page size, in
@@ -95,10 +106,18 @@ func NewRegistry(userNS *auth.UserNamespace) *Registry {
}
// FindByID looks up a segment given an ID.
+//
+// FindByID returns a reference on Shm.
func (r *Registry) FindByID(id ID) *Shm {
r.mu.Lock()
defer r.mu.Unlock()
- return r.shms[id]
+ s := r.shms[id]
+ // Take a reference on s. If TryIncRef fails, s has reached the last
+ // DecRef, but hasn't quite been removed from r.shms yet.
+ if s != nil && s.TryIncRef() {
+ return s
+ }
+ return nil
}
// dissociateKey removes the association between a segment and its key,
@@ -119,6 +138,8 @@ func (r *Registry) dissociateKey(s *Shm) {
// FindOrCreate looks up or creates a segment in the registry. It's functionally
// analogous to open(2).
+//
+// FindOrCreate returns a reference on Shm.
func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) {
if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) {
// "A new segment was to be created and size is less than SHMMIN or
@@ -166,6 +187,7 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui
return nil, syserror.EEXIST
}
+ shm.IncRef()
return shm, nil
}
@@ -193,7 +215,14 @@ func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size ui
// Need to create a new segment.
creator := fs.FileOwnerFromContext(ctx)
perms := fs.FilePermsFromMode(mode)
- return r.newShm(ctx, pid, key, creator, perms, size)
+ s, err := r.newShm(ctx, pid, key, creator, perms, size)
+ if err != nil {
+ return nil, err
+ }
+ // The initial reference is held by s itself. Take another to return to
+ // the caller.
+ s.IncRef()
+ return s, nil
}
// newShm creates a new segment in the registry.
@@ -296,22 +325,26 @@ func (r *Registry) remove(s *Shm) {
// Shm represents a single shared memory segment.
//
-// Shm segment are backed directly by an allocation from platform
-// memory. Segments are always mapped as a whole, greatly simplifying how
-// mappings are tracked. However note that mremap and munmap calls may cause the
-// vma for a segment to become fragmented; which requires special care when
-// unmapping a segment. See mm/shm.go.
+// Shm segments are backed directly by an allocation from platform memory.
+// Segments are always mapped as a whole, greatly simplifying how mappings are
+// tracked. However, note that mremap and munmap calls may cause the vma for a
+// segment to become fragmented, which requires special care when unmapping a
+// segment. See mm/shm.go.
//
// Segments persist until they are explicitly marked for destruction via
-// shmctl(SHM_RMID).
+// MarkDestroyed().
//
// Shm implements memmap.Mappable and memmap.MappingIdentity.
//
// +stateify savable
type Shm struct {
- // AtomicRefCount tracks the number of references to this segment from
- // maps. A segment always holds a reference to itself, until it's marked for
+ // AtomicRefCount tracks the number of references to this segment.
+ //
+ // A segment holds a reference to itself until it is marked for
// destruction.
+ //
+ // In addition to direct users, the MemoryManager will hold references
+ // via MappingIdentity.
refs.AtomicRefCount
mfp pgalloc.MemoryFileProvider
@@ -484,9 +517,8 @@ type AttachOpts struct {
// ConfigureAttach creates an mmap configuration for the segment with the
// requested attach options.
//
-// ConfigureAttach returns with a ref on s on success. The caller should drop
-// this once the map is installed. This reference prevents s from being
-// destroyed before the returned configuration is used.
+// Postconditions: The returned MMapOpts are valid only as long as a reference
+// continues to be held on s.
func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts AttachOpts) (memmap.MMapOpts, error) {
s.mu.Lock()
defer s.mu.Unlock()
@@ -504,7 +536,6 @@ func (s *Shm) ConfigureAttach(ctx context.Context, addr usermem.Addr, opts Attac
// in the user namespace that governs its IPC namespace." - man shmat(2)
return memmap.MMapOpts{}, syserror.EACCES
}
- s.IncRef()
return memmap.MMapOpts{
Length: s.size,
Offset: 0,
@@ -549,10 +580,15 @@ func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) {
}
creds := auth.CredentialsFromContext(ctx)
- nattach := uint64(s.ReadRefs())
- // Don't report the self-reference we keep prior to being marked for
- // destruction. However, also don't report a count of -1 for segments marked
- // as destroyed, with no mappings.
+ // Use the reference count as a rudimentary count of the number of
+ // attaches. We exclude:
+ //
+ // 1. The reference the caller holds.
+ // 2. The self-reference held by s prior to destruction.
+ //
+ // Note that this may still overcount by including transient references
+ // used in concurrent calls.
+ nattach := uint64(s.ReadRefs()) - 1
if !s.pendingDestruction {
nattach--
}
@@ -620,18 +656,17 @@ func (s *Shm) MarkDestroyed() {
s.registry.dissociateKey(s)
s.mu.Lock()
- // Only drop the segment's self-reference once, when destruction is
- // requested. Otherwise, repeated calls to shmctl(IPC_RMID) would force a
- // segment to be destroyed prematurely, potentially with active maps to the
- // segment's address range. Remaining references are dropped when the
- // segment is detached or unmaped.
+ defer s.mu.Unlock()
if !s.pendingDestruction {
s.pendingDestruction = true
- s.mu.Unlock() // Must release s.mu before calling s.DecRef.
+ // Drop the self-reference so destruction occurs when all
+ // external references are gone.
+ //
+ // N.B. This cannot be the final DecRef, as the caller also
+ // holds a reference.
s.DecRef()
return
}
- s.mu.Unlock()
}
// checkOwnership verifies whether a segment may be accessed by ctx as an
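Aside: MarkDestroyed drops the segment's self-reference exactly once, so actual destruction is deferred until every external reference (mappings, in-flight syscalls) is gone, and a repeated shmctl(IPC_RMID) becomes a no-op. A toy version of that "self-reference until marked destroyed" lifecycle is below; refcounting is reduced to a plain int64 and the destruction message stands in for releasing backing memory.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type shm struct {
	mu                 sync.Mutex
	refs               int64 // starts at 1: the segment's own reference
	pendingDestruction bool
}

func newShm() *shm { return &shm{refs: 1} }

func (s *shm) incRef() { atomic.AddInt64(&s.refs, 1) }

func (s *shm) decRef() {
	if atomic.AddInt64(&s.refs, -1) == 0 {
		fmt.Println("segment destroyed")
	}
}

// markDestroyed drops the self-reference at most once. Callers are expected
// to hold their own reference, so this DecRef can never be the final one.
func (s *shm) markDestroyed() {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.pendingDestruction {
		return
	}
	s.pendingDestruction = true
	s.decRef()
}

func main() {
	s := newShm()
	s.incRef()        // an attach (e.g. a mapping) holds a reference
	s.markDestroyed() // IPC_RMID: drops only the self-reference
	s.markDestroyed() // a second RMID is harmless
	s.decRef()        // the last detach triggers destruction
}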
diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go
index a16f3d57f..768fda220 100644
--- a/pkg/sentry/kernel/signal_handlers.go
+++ b/pkg/sentry/kernel/signal_handlers.go
@@ -15,10 +15,9 @@
package kernel
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sync"
)
// SignalHandlers holds information about signal actions.
diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD
index 9f7e19b4d..89e4d84b1 100644
--- a/pkg/sentry/kernel/signalfd/BUILD
+++ b/pkg/sentry/kernel/signalfd/BUILD
@@ -16,6 +16,7 @@ go_library(
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/kernel",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go
index 4b08d7d72..28be4a939 100644
--- a/pkg/sentry/kernel/signalfd/signalfd.go
+++ b/pkg/sentry/kernel/signalfd/signalfd.go
@@ -16,8 +16,6 @@
package signalfd
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/context"
@@ -26,6 +24,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 220fa73a2..d2d01add4 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -16,13 +16,13 @@ package kernel
import (
"fmt"
- "sync"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/bits"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
)
// maxSyscallNum is the highest supported syscall number.
@@ -339,6 +339,14 @@ func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn {
return nil
}
+// LookupName looks up a syscall name.
+func (s *SyscallTable) LookupName(sysno uintptr) string {
+ if sc, ok := s.Table[sysno]; ok {
+ return sc.Name
+ }
+ return fmt.Sprintf("sys_%d", sysno) // Unlikely.
+}
+
// LookupEmulate looks up an emulation syscall number.
func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
sysno, ok := s.Emulate[addr]
diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go
index 8227ecf1d..4607cde2f 100644
--- a/pkg/sentry/kernel/syslog.go
+++ b/pkg/sentry/kernel/syslog.go
@@ -17,7 +17,8 @@ package kernel
import (
"fmt"
"math/rand"
- "sync"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
// syslog represents a sentry-global kernel log.
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index 80c8e5464..978d66da8 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -15,7 +15,8 @@
package kernel
import (
- "sync"
+ gocontext "context"
+ "runtime/trace"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -35,7 +36,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/usermem"
- "gvisor.dev/gvisor/pkg/syncutil"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -83,7 +84,7 @@ type Task struct {
//
// gosched is protected by goschedSeq. gosched is owned by the task
// goroutine.
- goschedSeq syncutil.SeqCount `state:"nosave"`
+ goschedSeq sync.SeqCount `state:"nosave"`
gosched TaskGoroutineSchedInfo
// yieldCount is the number of times the task goroutine has called
@@ -390,7 +391,14 @@ type Task struct {
// logPrefix is a string containing the task's thread ID in the root PID
// namespace, and is prepended to log messages emitted by Task.Infof etc.
- logPrefix atomic.Value `state:".(string)"`
+ logPrefix atomic.Value `state:"nosave"`
+
+ // traceContext and traceTask are both used for tracing, and are
+ // updated along with the logPrefix in updateInfoLocked.
+ //
+ // These are exclusive to the task goroutine.
+ traceContext gocontext.Context `state:"nosave"`
+ traceTask *trace.Task `state:"nosave"`
// creds is the task's credentials.
//
@@ -480,18 +488,43 @@ type Task struct {
// netns is protected by mu. netns is owned by the task goroutine.
netns bool
- // If rseqPreempted is true, before the next call to p.Switch(), interrupt
- // RSEQ critical regions as defined by tg.rseq and write the task
- // goroutine's CPU number to rseqCPUAddr. rseqCPU is the last CPU number
- // written to rseqCPUAddr.
+ // If rseqPreempted is true, before the next call to p.Switch(),
+ // interrupt rseq critical regions as defined by rseqAddr and
+ // tg.oldRSeqCritical and write the task goroutine's CPU number to
+ // rseqAddr/oldRSeqCPUAddr.
//
- // If rseqCPUAddr is 0, rseqCPU is -1.
+ // We support two ABIs for restartable sequences:
//
- // rseqCPUAddr, rseqCPU, and rseqPreempted are exclusive to the task
- // goroutine.
+ // 1. The upstream interface added in v4.18,
+ // 2. An "old" interface never merged upstream. In the implementation,
+ // this is referred to as "old rseq".
+ //
+ // rseqPreempted is exclusive to the task goroutine.
rseqPreempted bool `state:"nosave"`
- rseqCPUAddr usermem.Addr
- rseqCPU int32
+
+ // rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr.
+ //
+ // If rseq is unused, rseqCPU is -1 for convenient use in
+ // platform.Context.Switch.
+ //
+ // rseqCPU is exclusive to the task goroutine.
+ rseqCPU int32
+
+ // oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable.
+ //
+ // oldRSeqCPUAddr is exclusive to the task goroutine.
+ oldRSeqCPUAddr usermem.Addr
+
+ // rseqAddr is a pointer to the userspace linux.RSeq structure.
+ //
+ // rseqAddr is exclusive to the task goroutine.
+ rseqAddr usermem.Addr
+
+ // rseqSignature is the signature that the rseq abort IP must be signed
+ // with.
+ //
+ // rseqSignature is exclusive to the task goroutine.
+ rseqSignature uint32
// copyScratchBuffer is a buffer available to CopyIn/CopyOut
// implementations that require an intermediate buffer to copy data
@@ -528,14 +561,6 @@ func (t *Task) loadPtraceTracer(tracer *Task) {
t.ptraceTracer.Store(tracer)
}
-func (t *Task) saveLogPrefix() string {
- return t.logPrefix.Load().(string)
-}
-
-func (t *Task) loadLogPrefix(prefix string) {
- t.logPrefix.Store(prefix)
-}
-
func (t *Task) saveSyscallFilters() []bpf.Program {
if f := t.syscallFilters.Load(); f != nil {
return f.([]bpf.Program)
@@ -549,6 +574,7 @@ func (t *Task) loadSyscallFilters(filters []bpf.Program) {
// afterLoad is invoked by stateify.
func (t *Task) afterLoad() {
+ t.updateInfoLocked()
t.interruptChan = make(chan struct{}, 1)
t.gosched.State = TaskGoroutineNonexistent
if t.stop != nil {
diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
index dd69939f9..4a4a69ee2 100644
--- a/pkg/sentry/kernel/task_block.go
+++ b/pkg/sentry/kernel/task_block.go
@@ -16,6 +16,7 @@ package kernel
import (
"runtime"
+ "runtime/trace"
"time"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -133,19 +134,24 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
runtime.Gosched()
}
+ region := trace.StartRegion(t.traceContext, blockRegion)
select {
case <-C:
+ region.End()
t.SleepFinish(true)
+ // Woken by event.
return nil
case <-interrupt:
+ region.End()
t.SleepFinish(false)
// Return the indicated error on interrupt.
return syserror.ErrInterrupted
case <-timerChan:
- // We've timed out.
+ region.End()
t.SleepFinish(true)
+ // We've timed out.
return syserror.ETIMEDOUT
}
}
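Aside: task_block.go wraps the blocking select in a runtime/trace region so that time spent blocked shows up in go tool trace output, ending the region on every exit path before sleep accounting runs. A self-contained sketch of the same shape; the ":block" region name follows the convention defined in task_log.go later in this change, and the channel/timeout plumbing is invented for the example.

package main

import (
	"context"
	"fmt"
	"runtime/trace"
	"time"
)

// block waits for either work or a timeout, and accounts the wait as a
// ":block" trace region so it is visible in go tool trace.
func block(ctx context.Context, work <-chan struct{}, timeout time.Duration) error {
	region := trace.StartRegion(ctx, ":block")
	timer := time.NewTimer(timeout)
	defer timer.Stop()
	select {
	case <-work:
		region.End()
		return nil // woken by event
	case <-timer.C:
		region.End()
		return fmt.Errorf("timed out after %v", timeout)
	}
}

func main() {
	ctx, task := trace.NewTask(context.Background(), "tid:1")
	defer task.End()
	work := make(chan struct{}, 1)
	work <- struct{}{}
	fmt.Println(block(ctx, work, time.Second))
}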
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 0916fd658..247bd4aba 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -236,14 +236,19 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
} else if opts.NewPIDNamespace {
pidns = pidns.NewChild(userns)
}
+
tg := t.tg
+ rseqAddr := usermem.Addr(0)
+ rseqSignature := uint32(0)
if opts.NewThreadGroup {
tg.mounts.IncRef()
sh := t.tg.signalHandlers
if opts.NewSignalHandlers {
sh = sh.Fork()
}
- tg = t.k.newThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
+ tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy())
+ rseqAddr = t.rseqAddr
+ rseqSignature = t.rseqSignature
}
cfg := &TaskConfig{
@@ -260,6 +265,8 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
UTSNamespace: utsns,
IPCNamespace: ipcns,
AbstractSocketNamespace: t.abstractSockets,
+ RSeqAddr: rseqAddr,
+ RSeqSignature: rseqSignature,
ContainerID: t.ContainerID(),
}
if opts.NewThreadGroup {
@@ -299,6 +306,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
// nt that it must receive before its task goroutine starts running.
tid := nt.k.tasks.Root.IDOfTask(nt)
defer nt.Start(tid)
+ t.traceCloneEvent(tid)
// "If fork/clone and execve are allowed by @prog, any child processes will
// be constrained to the same filters and system call ABI as the parent." -
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 17a089b90..fa6528386 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -129,6 +129,7 @@ type runSyscallAfterExecStop struct {
}
func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
+ t.traceExecEvent(r.tc)
t.tg.pidns.owner.mu.Lock()
t.tg.execing = nil
if t.killed() {
@@ -189,9 +190,11 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
t.updateRSSLocked()
// Restartable sequence state is discarded.
t.rseqPreempted = false
- t.rseqCPUAddr = 0
t.rseqCPU = -1
- t.tg.rscr.Store(&RSEQCriticalRegion{})
+ t.rseqAddr = 0
+ t.rseqSignature = 0
+ t.oldRSeqCPUAddr = 0
+ t.tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
t.tg.pidns.owner.mu.Unlock()
// Remove FDs with the CloseOnExec flag set.
@@ -253,7 +256,7 @@ func (t *Task) promoteLocked() {
t.tg.leader = t
t.Infof("Becoming TID %d (in root PID namespace)", t.tg.pidns.owner.Root.tids[t])
- t.updateLogPrefixLocked()
+ t.updateInfoLocked()
// Reap the original leader. If it has a tracer, detach it instead of
// waiting for it to acknowledge the original leader's death.
oldLeader.exitParentNotified = true
diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go
index 535f03e50..435761e5a 100644
--- a/pkg/sentry/kernel/task_exit.go
+++ b/pkg/sentry/kernel/task_exit.go
@@ -236,6 +236,7 @@ func (*runExit) execute(t *Task) taskRunState {
type runExitMain struct{}
func (*runExitMain) execute(t *Task) taskRunState {
+ t.traceExitEvent()
lastExiter := t.exitThreadGroup()
// If the task has a cleartid, and the thread group wasn't killed by a
diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go
index a29e9b9eb..0fb3661de 100644
--- a/pkg/sentry/kernel/task_log.go
+++ b/pkg/sentry/kernel/task_log.go
@@ -16,6 +16,7 @@ package kernel
import (
"fmt"
+ "runtime/trace"
"sort"
"gvisor.dev/gvisor/pkg/log"
@@ -127,11 +128,88 @@ func (t *Task) debugDumpStack() {
}
}
-// updateLogPrefix updates the task's cached log prefix to reflect its
-// current thread ID.
+// trace definitions.
+//
+// Note that all region names are prefixed by ':' in order to ensure that they
+// are lexically ordered before all system calls, which use the naked system
+// call name (e.g. "read") for maximum clarity.
+const (
+ traceCategory = "task"
+ runRegion = ":run"
+ blockRegion = ":block"
+ cpuidRegion = ":cpuid"
+ faultRegion = ":fault"
+)
+
+// updateInfoLocked updates the task's cached log prefix and tracing
+// information to reflect its current thread ID.
//
// Preconditions: The task's owning TaskSet.mu must be locked.
-func (t *Task) updateLogPrefixLocked() {
+func (t *Task) updateInfoLocked() {
// Use the task's TID in the root PID namespace for logging.
- t.logPrefix.Store(fmt.Sprintf("[% 4d] ", t.tg.pidns.owner.Root.tids[t]))
+ tid := t.tg.pidns.owner.Root.tids[t]
+ t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid))
+ t.rebuildTraceContext(tid)
+}
+
+// rebuildTraceContext rebuilds the trace context.
+//
+// Precondition: the passed tid must be the tid in the root namespace.
+func (t *Task) rebuildTraceContext(tid ThreadID) {
+ // Re-initialize the trace context.
+ if t.traceTask != nil {
+ t.traceTask.End()
+ }
+
+ // Note that we define the "task type" to be the dynamic TID. This does
+ // not align perfectly with the documentation for "tasks" in the
+ // tracing package. Tasks may be assumed to be bounded by analysis
+ // tools. However, if we just use a generic "task" type here, then the
+ // "user-defined tasks" page on the tracing dashboard becomes nearly
+ // unusable, as it loads all traces from all tasks.
+ //
+ // We can assume that the number of tasks in the system is not
+ // arbitrarily large (in general it won't be, especially for cases
+ // where we're collecting a brief profile), so using the TID is a
+ // reasonable compromise in this case.
+ t.traceContext, t.traceTask = trace.NewTask(t, fmt.Sprintf("tid:%d", tid))
+}
+
+// traceCloneEvent is called when a new task is spawned.
+//
+// ntid must be the new task's ThreadID in the root namespace.
+func (t *Task) traceCloneEvent(ntid ThreadID) {
+ if !trace.IsEnabled() {
+ return
+ }
+ trace.Logf(t.traceContext, traceCategory, "spawn: %d", ntid)
+}
+
+// traceExitEvent is called when a task exits.
+func (t *Task) traceExitEvent() {
+ if !trace.IsEnabled() {
+ return
+ }
+ trace.Logf(t.traceContext, traceCategory, "exit status: 0x%x", t.exitStatus.Status())
+}
+
+// traceExecEvent is called when a task calls exec.
+func (t *Task) traceExecEvent(tc *TaskContext) {
+ if !trace.IsEnabled() {
+ return
+ }
+ d := tc.MemoryManager.Executable()
+ if d == nil {
+ trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
+ return
+ }
+ defer d.DecRef()
+ root := t.fsContext.RootDirectory()
+ if root == nil {
+ trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>")
+ return
+ }
+ defer root.DecRef()
+ n, _ := d.FullName(root)
+ trace.Logf(t.traceContext, traceCategory, "exec: %s", n)
}
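Aside: updateInfoLocked now creates one runtime/trace task per thread ID and logs clone/exec/exit as events against it, guarded by trace.IsEnabled so the hot path stays cheap when tracing is off. A compact sketch of that wiring using only the standard library; the TID values and event payloads are made up for the example.

package main

import (
	"context"
	"fmt"
	"runtime/trace"
)

type task struct {
	ctx       context.Context
	traceTask *trace.Task
}

// rebuildTraceContext mirrors the diff: end any previous trace task and
// start a new one named after the TID so traces can be filtered per task.
func (t *task) rebuildTraceContext(tid int) {
	if t.traceTask != nil {
		t.traceTask.End()
	}
	t.ctx, t.traceTask = trace.NewTask(context.Background(), fmt.Sprintf("tid:%d", tid))
}

// traceCloneEvent logs a spawn event only when tracing is actually enabled.
func (t *task) traceCloneEvent(newTID int) {
	if !trace.IsEnabled() {
		return
	}
	trace.Logf(t.ctx, "task", "spawn: %d", newTID)
}

func main() {
	t := &task{}
	t.rebuildTraceContext(1)
	t.traceCloneEvent(2)
	t.traceTask.End()
	fmt.Println("tracing enabled:", trace.IsEnabled())
}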
diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go
index c92266c59..6357273d3 100644
--- a/pkg/sentry/kernel/task_run.go
+++ b/pkg/sentry/kernel/task_run.go
@@ -17,6 +17,7 @@ package kernel
import (
"bytes"
"runtime"
+ "runtime/trace"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -168,12 +169,22 @@ func (*runApp) execute(t *Task) taskRunState {
// Apply restartable sequences.
if t.rseqPreempted {
t.rseqPreempted = false
- if t.rseqCPUAddr != 0 {
+ if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 {
+ // Linux writes the CPU on every preemption. We only do
+ // so if it changed. Thus we may delay delivery of
+ // SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid.
cpu := int32(hostcpu.GetCPU())
if t.rseqCPU != cpu {
t.rseqCPU = cpu
if err := t.rseqCopyOutCPU(); err != nil {
- t.Warningf("Failed to copy CPU to %#x for RSEQ: %v", t.rseqCPUAddr, err)
+ t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
+ t.forceSignal(linux.SIGSEGV, false)
+ t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
+ // Re-enter the task run loop for signal delivery.
+ return (*runApp)(nil)
+ }
+ if err := t.oldRSeqCopyOutCPU(); err != nil {
+ t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err)
t.forceSignal(linux.SIGSEGV, false)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
// Re-enter the task run loop for signal delivery.
@@ -205,9 +216,11 @@ func (*runApp) execute(t *Task) taskRunState {
t.tg.pidns.owner.mu.RUnlock()
}
+ region := trace.StartRegion(t.traceContext, runRegion)
t.accountTaskGoroutineEnter(TaskGoroutineRunningApp)
info, at, err := t.p.Switch(t.MemoryManager().AddressSpace(), t.Arch(), t.rseqCPU)
t.accountTaskGoroutineLeave(TaskGoroutineRunningApp)
+ region.End()
if clearSinglestep {
t.Arch().ClearSingleStep()
@@ -225,6 +238,7 @@ func (*runApp) execute(t *Task) taskRunState {
case platform.ErrContextSignalCPUID:
// Is this a CPUID instruction?
+ region := trace.StartRegion(t.traceContext, cpuidRegion)
expected := arch.CPUIDInstruction[:]
found := make([]byte, len(expected))
_, err := t.CopyIn(usermem.Addr(t.Arch().IP()), &found)
@@ -232,10 +246,12 @@ func (*runApp) execute(t *Task) taskRunState {
// Skip the cpuid instruction.
t.Arch().CPUIDEmulate(t)
t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected)))
+ region.End()
// Resume execution.
return (*runApp)(nil)
}
+ region.End() // Not an actual CPUID, but required copy-in.
// The instruction at the given RIP was not a CPUID, and we
// fallthrough to the default signal deliver behavior below.
@@ -251,8 +267,10 @@ func (*runApp) execute(t *Task) taskRunState {
// an application-generated signal and we should continue execution
// normally.
if at.Any() {
+ region := trace.StartRegion(t.traceContext, faultRegion)
addr := usermem.Addr(info.Addr())
err := t.MemoryManager().HandleUserFault(t, addr, at, usermem.Addr(t.Arch().Stack()))
+ region.End()
if err == nil {
// The fault was handled appropriately.
// We can resume running the application.
@@ -260,6 +278,12 @@ func (*runApp) execute(t *Task) taskRunState {
}
// Is this a vsyscall that we need emulate?
+ //
+ // Note that we don't track vsyscalls as part of a
+ // specific trace region. This is because regions don't
+ // stack, and the actual system call will count as a
+ // region. We should be able to easily identify
+ // vsyscalls by having a <fault><syscall> pair.
if at.Execute {
if sysno, ok := t.tc.st.LookupEmulate(addr); ok {
return t.doVsyscall(addr, sysno)
@@ -306,7 +330,7 @@ func (*runApp) execute(t *Task) taskRunState {
return (*runApp)(nil)
case platform.ErrContextCPUPreempted:
- // Ensure that RSEQ critical sections are interrupted and per-thread
+ // Ensure that rseq critical sections are interrupted and per-thread
// CPU values are updated before the next platform.Context.Switch().
t.rseqPreempted = true
return (*runApp)(nil)
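Note: the preemption path above only rewrites the rseq CPU field when the CPU actually changed, and turns a failed copy-out into a SIGSEGV for the task. Purely as a reading aid, a minimal standalone sketch of that control flow follows; writeCPU and sendSegv are hypothetical stand-ins for rseqCopyOutCPU and signal delivery, not gVisor APIs.

// Hedged sketch; helper names are illustrative only.
func handlePreemption(lastCPU *int32, currentCPU int32, writeCPU func(int32) error, sendSegv func()) {
	if *lastCPU == currentCPU {
		return // Unlike Linux, skip the write when the CPU did not change.
	}
	*lastCPU = currentCPU
	if err := writeCPU(currentCPU); err != nil {
		// The rseq address is unmapped or otherwise unwritable: fault the task.
		sendSegv()
	}
}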
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index ae6fc4025..58af16ee2 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
"gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -79,6 +80,13 @@ type TaskConfig struct {
// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
AbstractSocketNamespace *AbstractSocketNamespace
+ // RSeqAddr is a pointer to the userspace linux.RSeq structure.
+ RSeqAddr usermem.Addr
+
+ // RSeqSignature is the signature that the rseq abort IP must be signed
+ // with.
+ RSeqSignature uint32
+
// ContainerID is the container the new task belongs to.
ContainerID string
}
@@ -126,6 +134,8 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
ipcns: cfg.IPCNamespace,
abstractSockets: cfg.AbstractSocketNamespace,
rseqCPU: -1,
+ rseqAddr: cfg.RSeqAddr,
+ rseqSignature: cfg.RSeqSignature,
futexWaiter: futex.NewWaiter(),
containerID: cfg.ContainerID,
}
@@ -154,10 +164,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
// Below this point, newTask is expected not to fail (there is no rollback
// of assignTIDsLocked or any of the following).
- // Logging on t's behalf will panic if t.logPrefix hasn't been initialized.
- // This is the earliest point at which we can do so (since t now has thread
- // IDs).
- t.updateLogPrefixLocked()
+ // Logging on t's behalf will panic if t.logPrefix hasn't been
+ // initialized. This is the earliest point at which we can do so
+ // (since t now has thread IDs).
+ t.updateInfoLocked()
if cfg.InheritParent != nil {
t.parent = cfg.InheritParent.parent
diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go
index b543d536a..3180f5560 100644
--- a/pkg/sentry/kernel/task_syscall.go
+++ b/pkg/sentry/kernel/task_syscall.go
@@ -17,6 +17,7 @@ package kernel
import (
"fmt"
"os"
+ "runtime/trace"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -160,6 +161,10 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u
ctrl = ctrlStopAndReinvokeSyscall
} else {
fn := s.Lookup(sysno)
+ var region *trace.Region // Only non-nil if tracing == true.
+ if trace.IsEnabled() {
+ region = trace.StartRegion(t.traceContext, s.LookupName(sysno))
+ }
if fn != nil {
// Call our syscall implementation.
rval, ctrl, err = fn(t, args)
@@ -167,6 +172,9 @@ func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval u
// Use the missing function if not found.
rval, err = t.SyscallTable().Missing(t, sysno, args)
}
+ if region != nil {
+ region.End()
+ }
}
if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
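Note: the syscall hunk above creates a trace region only when tracing is enabled, so untraced runs skip the bookkeeping entirely. A minimal sketch of the same runtime/trace pattern; the helper name is illustrative and not part of this change.

import (
	"context"
	"runtime/trace"
)

// runTraced wraps fn in a trace region only while tracing is active.
func runTraced(ctx context.Context, name string, fn func()) {
	var region *trace.Region // Non-nil only if tracing is enabled.
	if trace.IsEnabled() {
		region = trace.StartRegion(ctx, name)
	}
	fn()
	if region != nil {
		region.End()
	}
}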
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 72568d296..768e958d2 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -15,7 +15,6 @@
package kernel
import (
- "sync"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -25,6 +24,7 @@ import (
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -238,8 +238,8 @@ type ThreadGroup struct {
// execed is protected by the TaskSet mutex.
execed bool
- // rscr is the thread group's RSEQ critical region.
- rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
+ // oldRSeqCritical is the thread group's old rseq critical region.
+ oldRSeqCritical atomic.Value `state:".(*OldRSeqCriticalRegion)"`
// mounts is the thread group's mount namespace. This does not really
// correspond to a "mount namespace" in Linux, but is more like a
@@ -256,35 +256,35 @@ type ThreadGroup struct {
tty *TTY
}
-// newThreadGroup returns a new, empty thread group in PID namespace ns. The
+// NewThreadGroup returns a new, empty thread group in PID namespace ns. The
// thread group leader will send its parent terminationSignal when it exits.
// The new thread group isn't visible to the system until a task has been
// created inside of it by a successful call to TaskSet.NewTask.
-func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
+func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup {
tg := &ThreadGroup{
threadGroupNode: threadGroupNode{
- pidns: ns,
+ pidns: pidns,
},
signalHandlers: sh,
terminationSignal: terminationSignal,
ioUsage: &usage.IO{},
limits: limits,
- mounts: mounts,
+ mounts: mntns,
}
tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg})
tg.timers = make(map[linux.TimerID]*IntervalTimer)
- tg.rscr.Store(&RSEQCriticalRegion{})
+ tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
return tg
}
-// saveRscr is invoked by stateify.
-func (tg *ThreadGroup) saveRscr() *RSEQCriticalRegion {
- return tg.rscr.Load().(*RSEQCriticalRegion)
+// saveOldRSeqCritical is invoked by stateify.
+func (tg *ThreadGroup) saveOldRSeqCritical() *OldRSeqCriticalRegion {
+ return tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
}
-// loadRscr is invoked by stateify.
-func (tg *ThreadGroup) loadRscr(rscr *RSEQCriticalRegion) {
- tg.rscr.Store(rscr)
+// loadOldRSeqCritical is invoked by stateify.
+func (tg *ThreadGroup) loadOldRSeqCritical(r *OldRSeqCriticalRegion) {
+ tg.oldRSeqCritical.Store(r)
}
// SignalHandlers returns the signal handlers used by tg.
diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go
index 8267929a6..bf2dabb6e 100644
--- a/pkg/sentry/kernel/threads.go
+++ b/pkg/sentry/kernel/threads.go
@@ -16,9 +16,9 @@ package kernel
import (
"fmt"
- "sync"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD
index 31847e1df..4e4de0512 100644
--- a/pkg/sentry/kernel/time/BUILD
+++ b/pkg/sentry/kernel/time/BUILD
@@ -13,6 +13,7 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/sentry/context",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go
index 107394183..706de83ef 100644
--- a/pkg/sentry/kernel/time/time.go
+++ b/pkg/sentry/kernel/time/time.go
@@ -19,10 +19,10 @@ package time
import (
"fmt"
"math"
- "sync"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go
index 76417342a..dc99301de 100644
--- a/pkg/sentry/kernel/timekeeper.go
+++ b/pkg/sentry/kernel/timekeeper.go
@@ -16,7 +16,6 @@ package kernel
import (
"fmt"
- "sync"
"time"
"gvisor.dev/gvisor/pkg/log"
@@ -24,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/platform"
sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
+ "gvisor.dev/gvisor/pkg/sync"
)
// Timekeeper manages all of the kernel clocks.
diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go
index 34f84487a..464d2306a 100644
--- a/pkg/sentry/kernel/tty.go
+++ b/pkg/sentry/kernel/tty.go
@@ -14,15 +14,26 @@
package kernel
-import "sync"
+import "gvisor.dev/gvisor/pkg/sync"
// TTY defines the relationship between a thread group and its controlling
// terminal.
//
// +stateify savable
type TTY struct {
+ // Index is the terminal index. It is immutable.
+ Index uint32
+
mu sync.Mutex `state:"nosave"`
// tg is protected by mu.
tg *ThreadGroup
}
+
+// TTY returns the thread group's controlling terminal. If nil, there is no
+// controlling terminal.
+func (tg *ThreadGroup) TTY() *TTY {
+ tg.signalHandlers.mu.Lock()
+ defer tg.signalHandlers.mu.Unlock()
+ return tg.tty
+}
diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go
index 0a563e715..8ccf04bd1 100644
--- a/pkg/sentry/kernel/uts_namespace.go
+++ b/pkg/sentry/kernel/uts_namespace.go
@@ -15,9 +15,8 @@
package kernel
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sync"
)
// UTSNamespace represents a UTS namespace, a holder of two system identifiers:
diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD
index 156e67bf8..9fa841e8b 100644
--- a/pkg/sentry/limits/BUILD
+++ b/pkg/sentry/limits/BUILD
@@ -15,6 +15,7 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/sentry/context",
+ "//pkg/sync",
],
)
diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go
index b6c22656b..31b9e9ff6 100644
--- a/pkg/sentry/limits/limits.go
+++ b/pkg/sentry/limits/limits.go
@@ -16,8 +16,9 @@
package limits
import (
- "sync"
"syscall"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
// LimitType defines a type of resource limit.
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index c2c3ec06e..6299a3e2f 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -408,6 +408,8 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
start = vaddr
}
if vaddr < end {
+ // NOTE(b/37474556): Linux allows out-of-order
+ // segments, in violation of the spec.
ctx.Infof("PT_LOAD headers out-of-order. %#x < %#x", vaddr, end)
return loadedELF{}, syserror.ENOEXEC
}
diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD
index 3ef84245b..112794e9c 100644
--- a/pkg/sentry/memmap/BUILD
+++ b/pkg/sentry/memmap/BUILD
@@ -41,7 +41,6 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/log",
- "//pkg/refs",
"//pkg/sentry/context",
"//pkg/sentry/platform",
"//pkg/sentry/usermem",
diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go
index 03b99aaea..16a722a13 100644
--- a/pkg/sentry/memmap/memmap.go
+++ b/pkg/sentry/memmap/memmap.go
@@ -18,7 +18,6 @@ package memmap
import (
"fmt"
- "gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usermem"
@@ -235,8 +234,11 @@ type InvalidateOpts struct {
// coincidental; fs.File implements MappingIdentity, and some
// fs.InodeOperations implement Mappable.)
type MappingIdentity interface {
- // MappingIdentity is reference-counted.
- refs.RefCounter
+ // IncRef increments the MappingIdentity's reference count.
+ IncRef()
+
+ // DecRef decrements the MappingIdentity's reference count.
+ DecRef()
// MappedName returns the application-visible name shown in
// /proc/[pid]/maps.
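Note: with refs.RefCounter no longer embedded, a MappingIdentity implementation only needs IncRef and DecRef with the signatures above (the other methods of the interface are unchanged). A toy sketch of just the two reference-counting methods; the type name is illustrative, not gVisor code.

import "sync/atomic"

// countedIdentity is a toy reference-counted identity.
type countedIdentity struct {
	refs int64
}

func (c *countedIdentity) IncRef() { atomic.AddInt64(&c.refs, 1) }

func (c *countedIdentity) DecRef() {
	if atomic.AddInt64(&c.refs, -1) == 0 {
		// Last reference dropped; release any backing resources here.
	}
}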
diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD
index 839931f67..83e248431 100644
--- a/pkg/sentry/mm/BUILD
+++ b/pkg/sentry/mm/BUILD
@@ -118,7 +118,7 @@ go_library(
"//pkg/sentry/safemem",
"//pkg/sentry/usage",
"//pkg/sentry/usermem",
- "//pkg/syncutil",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/tcpip/buffer",
],
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index 1b746d030..4b48866ad 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -15,8 +15,6 @@
package mm
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/context"
@@ -25,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 58a5c186d..fa86ebced 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -35,8 +35,6 @@
package mm
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/memmap"
@@ -44,7 +42,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/usermem"
- "gvisor.dev/gvisor/pkg/syncutil"
+ "gvisor.dev/gvisor/pkg/sync"
)
// MemoryManager implements a virtual address space.
@@ -82,7 +80,7 @@ type MemoryManager struct {
users int32
// mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
- mappingMu syncutil.DowngradableRWMutex `state:"nosave"`
+ mappingMu sync.DowngradableRWMutex `state:"nosave"`
// vmas stores virtual memory areas. Since vmas are stored by value,
// clients should usually use vmaIterator.ValuePtr() instead of
@@ -125,7 +123,7 @@ type MemoryManager struct {
// activeMu is loosely analogous to Linux's struct
// mm_struct::page_table_lock.
- activeMu syncutil.DowngradableRWMutex `state:"nosave"`
+ activeMu sync.DowngradableRWMutex `state:"nosave"`
// pmas stores platform mapping areas used to implement vmas. Since pmas
// are stored by value, clients should usually use pmaIterator.ValuePtr()
diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go
index 8c2246bb4..79610acb7 100644
--- a/pkg/sentry/mm/procfs.go
+++ b/pkg/sentry/mm/procfs.go
@@ -66,8 +66,6 @@ func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer
var start usermem.Addr
for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
- // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
- // "panic: autosave error: type usermem.Addr is not registered".
mm.appendVMAMapsEntryLocked(ctx, vseg, buf)
}
@@ -81,7 +79,6 @@ func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer
//
// Artifically adjust the seqfile handle so we only output vsyscall entry once.
if start != vsyscallEnd {
- // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
buf.WriteString(vsyscallMapsEntry)
}
}
@@ -97,8 +94,6 @@ func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile
start = *handle.(*usermem.Addr)
}
for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
- // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
- // "panic: autosave error: type usermem.Addr is not registered".
vmaAddr := vseg.End()
data = append(data, seqfile.SeqData{
Buf: mm.vmaMapsEntryLocked(ctx, vseg),
@@ -116,7 +111,6 @@ func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile
//
// Artifically adjust the seqfile handle so we only output vsyscall entry once.
if start != vsyscallEnd {
- // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
vmaAddr := vsyscallEnd
data = append(data, seqfile.SeqData{
Buf: []byte(vsyscallMapsEntry),
@@ -187,15 +181,12 @@ func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffe
var start usermem.Addr
for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
- // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
- // "panic: autosave error: type usermem.Addr is not registered".
mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf)
}
// We always emulate vsyscall, so advertise it here. See
// ReadMapsSeqFileData for additional commentary.
if start != vsyscallEnd {
- // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
buf.WriteString(vsyscallSmapsEntry)
}
}
@@ -211,8 +202,6 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil
start = *handle.(*usermem.Addr)
}
for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
- // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
- // "panic: autosave error: type usermem.Addr is not registered".
vmaAddr := vseg.End()
data = append(data, seqfile.SeqData{
Buf: mm.vmaSmapsEntryLocked(ctx, vseg),
@@ -223,7 +212,6 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil
// We always emulate vsyscall, so advertise it here. See
// ReadMapsSeqFileData for additional commentary.
if start != vsyscallEnd {
- // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
vmaAddr := vsyscallEnd
data = append(data, seqfile.SeqData{
Buf: []byte(vsyscallSmapsEntry),
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index f404107af..a9a2642c5 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -73,6 +73,7 @@ go_library(
"//pkg/sentry/usage",
"//pkg/sentry/usermem",
"//pkg/state",
+ "//pkg/sync",
"//pkg/syserror",
],
)
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index f7f7298c4..c99e023d9 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -25,7 +25,6 @@ import (
"fmt"
"math"
"os"
- "sync"
"sync/atomic"
"syscall"
"time"
@@ -37,6 +36,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD
index b6d008dbe..85e882df9 100644
--- a/pkg/sentry/platform/interrupt/BUILD
+++ b/pkg/sentry/platform/interrupt/BUILD
@@ -10,6 +10,7 @@ go_library(
],
importpath = "gvisor.dev/gvisor/pkg/sentry/platform/interrupt",
visibility = ["//pkg/sentry:internal"],
+ deps = ["//pkg/sync"],
)
go_test(
diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go
index a4651f500..57be41647 100644
--- a/pkg/sentry/platform/interrupt/interrupt.go
+++ b/pkg/sentry/platform/interrupt/interrupt.go
@@ -17,7 +17,8 @@ package interrupt
import (
"fmt"
- "sync"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
// Receiver receives interrupt notifications from a Forwarder.
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 6803d488c..6a358d1d4 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -12,20 +12,30 @@ go_library(
"bluepill_amd64.go",
"bluepill_amd64.s",
"bluepill_amd64_unsafe.go",
+ "bluepill_arm64.go",
+ "bluepill_arm64.s",
+ "bluepill_arm64_unsafe.go",
"bluepill_fault.go",
"bluepill_unsafe.go",
"context.go",
- "filters.go",
+ "filters_amd64.go",
+ "filters_arm64.go",
"kvm.go",
"kvm_amd64.go",
"kvm_amd64_unsafe.go",
+ "kvm_arm64.go",
+ "kvm_arm64_unsafe.go",
"kvm_const.go",
+ "kvm_const_arm64.go",
"machine.go",
"machine_amd64.go",
"machine_amd64_unsafe.go",
"machine_arm64.go",
+ "machine_arm64_unsafe.go",
"machine_unsafe.go",
"physical_map.go",
+ "physical_map_amd64.go",
+ "physical_map_arm64.go",
"virtual_map.go",
],
importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm",
@@ -45,6 +55,7 @@ go_library(
"//pkg/sentry/platform/safecopy",
"//pkg/sentry/time",
"//pkg/sentry/usermem",
+ "//pkg/sync",
],
)
diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go
index ea8b9632e..a25f3c449 100644
--- a/pkg/sentry/platform/kvm/address_space.go
+++ b/pkg/sentry/platform/kvm/address_space.go
@@ -15,13 +15,13 @@
package kvm
import (
- "sync"
"sync/atomic"
"gvisor.dev/gvisor/pkg/atomicbitops"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
)
// dirtySet tracks vCPUs for invalidation.
diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go
index 043de51b3..30dbb74d6 100644
--- a/pkg/sentry/platform/kvm/bluepill.go
+++ b/pkg/sentry/platform/kvm/bluepill.go
@@ -20,6 +20,7 @@ import (
"syscall"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0"
"gvisor.dev/gvisor/pkg/sentry/platform/safecopy"
)
@@ -36,6 +37,18 @@ func sighandler()
func dieTrampoline()
var (
+ // bounceSignal is the signal used for bouncing KVM.
+ //
+ // We use SIGCHLD because it is not masked by the runtime, and
+ // it will be ignored properly by other parts of the kernel.
+ bounceSignal = syscall.SIGCHLD
+
+ // bounceSignalMask has only bounceSignal set.
+ bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
+
+ // bounce is the interrupt vector used to return to the kernel.
+ bounce = uint32(ring0.VirtualizationException)
+
// savedHandler is a pointer to the previous handler.
//
// This is called by bluepillHandler.
@@ -45,6 +58,13 @@ var (
dieTrampolineAddr uintptr
)
+// redpill invokes a syscall with -1.
+//
+//go:nosplit
+func redpill() {
+ syscall.RawSyscall(^uintptr(0), 0, 0, 0)
+}
+
// dieHandler is called by dieTrampoline.
//
//go:nosplit
@@ -73,8 +93,8 @@ func (c *vCPU) die(context *arch.SignalContext64, msg string) {
func init() {
// Install the handler.
- if err := safecopy.ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
- panic(fmt.Sprintf("Unable to set handler for signal %d: %v", syscall.SIGSEGV, err))
+ if err := safecopy.ReplaceSignalHandler(bluepillSignal, reflect.ValueOf(sighandler).Pointer(), &savedHandler); err != nil {
+ panic(fmt.Sprintf("Unable to set handler for signal %d: %v", bluepillSignal, err))
}
// Extract the address for the trampoline.
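Note: bounceSignalMask above sets exactly one bit, following the Linux convention that signal N occupies bit N-1 of a signal mask. A tiny self-contained sketch of that computation:

package main

import (
	"fmt"
	"syscall"
)

func main() {
	sig := syscall.SIGCHLD
	mask := uint64(1) << (uint64(sig) - 1)
	fmt.Printf("signal %d -> mask %#x\n", sig, mask) // SIGCHLD (17) sets bit 16.
}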
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64.go b/pkg/sentry/platform/kvm/bluepill_amd64.go
index 421c88220..133c2203d 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64.go
@@ -24,26 +24,10 @@ import (
)
var (
- // bounceSignal is the signal used for bouncing KVM.
- //
- // We use SIGCHLD because it is not masked by the runtime, and
- // it will be ignored properly by other parts of the kernel.
- bounceSignal = syscall.SIGCHLD
-
- // bounceSignalMask has only bounceSignal set.
- bounceSignalMask = uint64(1 << (uint64(bounceSignal) - 1))
-
- // bounce is the interrupt vector used to return to the kernel.
- bounce = uint32(ring0.VirtualizationException)
+ // The action for bluepillSignal is changed by sigaction().
+ bluepillSignal = syscall.SIGSEGV
)
-// redpill on amd64 invokes a syscall with -1.
-//
-//go:nosplit
-func redpill() {
- syscall.RawSyscall(^uintptr(0), 0, 0, 0)
-}
-
// bluepillArchEnter is called during bluepillEnter.
//
//go:nosplit
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index 9d8af143e..a63a6a071 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -23,13 +23,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
)
-// bluepillArchContext returns the arch-specific context.
-//
-//go:nosplit
-func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
- return &((*arch.UContext64)(context).MContext)
-}
-
// dieArchSetup initializes the state for dieTrampoline.
//
// The amd64 dieTrampoline requires the vCPU to be set in BX, and the last RIP
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
new file mode 100644
index 000000000..552341721
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -0,0 +1,79 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0"
+)
+
+var (
+ // The action for bluepillSignal is changed by sigaction().
+ bluepillSignal = syscall.SIGILL
+)
+
+// bluepillArchEnter is called during bluepillEnter.
+//
+//go:nosplit
+func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) {
+ c = vCPUPtr(uintptr(context.Regs[8]))
+ regs := c.CPU.Registers()
+ regs.Regs = context.Regs
+ regs.Sp = context.Sp
+ regs.Pc = context.Pc
+ regs.Pstate = context.Pstate
+ regs.Pstate &^= uint64(ring0.KernelFlagsClear)
+ regs.Pstate |= ring0.KernelFlagsSet
+ return
+}
+
+// bluepillArchExit is called during bluepillEnter.
+//
+//go:nosplit
+func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
+ regs := c.CPU.Registers()
+ context.Regs = regs.Regs
+ context.Sp = regs.Sp
+ context.Pc = regs.Pc
+ context.Pstate = regs.Pstate
+ context.Pstate &^= uint64(ring0.UserFlagsClear)
+ context.Pstate |= ring0.UserFlagsSet
+}
+
+// KernelSyscall handles kernel syscalls.
+//
+//go:nosplit
+func (c *vCPU) KernelSyscall() {
+ regs := c.Registers()
+ if regs.Regs[8] != ^uint64(0) {
+ regs.Pc -= 4 // Rewind.
+ }
+ ring0.Halt()
+}
+
+// KernelException handles kernel exceptions.
+//
+//go:nosplit
+func (c *vCPU) KernelException(vector ring0.Vector) {
+ regs := c.Registers()
+ if vector == ring0.Vector(bounce) {
+ regs.Pc = 0
+ }
+ ring0.Halt()
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s
new file mode 100644
index 000000000..c61700892
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.s
@@ -0,0 +1,87 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// VCPU_CPU is the location of the CPU in the vCPU struct.
+//
+// This is guaranteed to be zero.
+#define VCPU_CPU 0x0
+
+// CPU_SELF is the self reference in ring0's percpu.
+//
+// This is guaranteed to be zero.
+#define CPU_SELF 0x0
+
+// Context offsets.
+//
+// Only limited use of the context is done in the assembly stub below, most is
+// done in the Go handlers.
+#define SIGINFO_SIGNO 0x0
+#define CONTEXT_PC 0x1B8
+#define CONTEXT_R0 0xB8
+
+// See bluepill.go.
+TEXT ·bluepill(SB),NOSPLIT,$0
+begin:
+ MOVD vcpu+0(FP), R8
+ MOVD $VCPU_CPU(R8), R9
+ ORR $0xffff000000000000, R9, R9
+ // Trigger sigill.
+ // In ring0.Start(), the value of R8 will be stored into tpidr_el1.
+ // Once the context has been loaded into the vCPU successfully,
+ // we check whether R10 and R9 hold the same value.
+ WORD $0xd538d08a // MRS TPIDR_EL1, R10
+check_vcpu:
+ CMP R10, R9
+ BEQ right_vCPU
+wrong_vcpu:
+ CALL ·redpill(SB)
+ B begin
+right_vCPU:
+ RET
+
+// sighandler: see bluepill.go for documentation.
+//
+// The arguments are the following:
+//
+// R0 - The signal number.
+// R1 - Pointer to siginfo_t structure.
+// R2 - Pointer to ucontext structure.
+//
+TEXT ·sighandler(SB),NOSPLIT,$0
+ // si_signo should be sigill.
+ MOVD SIGINFO_SIGNO(R1), R7
+ CMPW $4, R7
+ BNE fallback
+
+ MOVD CONTEXT_PC(R2), R7
+ CMPW $0, R7
+ BEQ fallback
+
+ MOVD R2, 8(RSP)
+ BL ·bluepillHandler(SB) // Call the handler.
+
+ RET
+
+fallback:
+ // Jump to the previous signal handler.
+ MOVD ·savedHandler(SB), R7
+ B (R7)
+
+// dieTrampoline: see bluepill.go, bluepill_arm64_unsafe.go for documentation.
+TEXT ·dieTrampoline(SB),NOSPLIT,$0
+ // TODO(gvisor.dev/issue/1249): dieTrampoline support for Arm64.
+ MOVD R9, 8(RSP)
+ BL ·dieHandler(SB)
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
new file mode 100644
index 000000000..2f02c03cf
--- /dev/null
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -0,0 +1,26 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+//go:nosplit
+func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
+ // TODO(gvisor.dev/issue/1249): dieTrampoline support for Arm64.
+}
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index ca011ef78..9add7c944 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -23,6 +23,8 @@ import (
"sync/atomic"
"syscall"
"unsafe"
+
+ "gvisor.dev/gvisor/pkg/sentry/arch"
)
//go:linkname throw runtime.throw
@@ -49,6 +51,13 @@ func uintptrValue(addr *byte) uintptr {
return (uintptr)(unsafe.Pointer(addr))
}
+// bluepillArchContext returns the UContext64.
+//
+//go:nosplit
+func bluepillArchContext(context unsafe.Pointer) *arch.SignalContext64 {
+ return &((*arch.UContext64)(context).MContext)
+}
+
// bluepillHandler is called from the signal stub.
//
// The world may be stopped while this is executing, and it executes on the
diff --git a/pkg/sentry/platform/kvm/filters.go b/pkg/sentry/platform/kvm/filters_amd64.go
index 7d949f1dd..7d949f1dd 100644
--- a/pkg/sentry/platform/kvm/filters.go
+++ b/pkg/sentry/platform/kvm/filters_amd64.go
diff --git a/pkg/sentry/platform/kvm/filters_arm64.go b/pkg/sentry/platform/kvm/filters_arm64.go
new file mode 100644
index 000000000..9245d07c2
--- /dev/null
+++ b/pkg/sentry/platform/kvm/filters_arm64.go
@@ -0,0 +1,32 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/seccomp"
+)
+
+// SyscallFilters returns syscalls made exclusively by the KVM platform.
+func (*KVM) SyscallFilters() seccomp.SyscallRules {
+ return seccomp.SyscallRules{
+ syscall.SYS_IOCTL: {},
+ syscall.SYS_MMAP: {},
+ syscall.SYS_RT_SIGSUSPEND: {},
+ syscall.SYS_RT_SIGTIMEDWAIT: {},
+ 0xffffffffffffffff: {}, // KVM uses syscall -1 to transition to host.
+ }
+}
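Note: the 0xffffffffffffffff entry above is the two's-complement encoding of syscall number -1, which redpill (shown earlier) issues to bounce from guest back to host mode. A tiny check of that equivalence:

package main

import "fmt"

func main() {
	minusOne := ^uintptr(0)       // The syscall number redpill passes to RawSyscall.
	fmt.Printf("%#x\n", minusOne) // 0xffffffffffffffff on 64-bit hosts.
}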
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index ee4cd2f4d..a7850faed 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -18,14 +18,13 @@ package kvm
import (
"fmt"
"os"
- "sync"
"syscall"
- "gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
)
// KVM represents a lightweight VM context.
@@ -56,9 +55,7 @@ func New(deviceFile *os.File) (*KVM, error) {
// Ensure global initialization is done.
globalOnce.Do(func() {
- physicalInit()
- globalErr = updateSystemValues(int(fd))
- ring0.Init(cpuid.HostFeatureSet())
+ globalErr = updateGlobalOnce(int(fd))
})
if globalErr != nil {
return nil, globalErr
diff --git a/pkg/sentry/platform/kvm/kvm_amd64.go b/pkg/sentry/platform/kvm/kvm_amd64.go
index 5d8ef4761..c5a6f9c7d 100644
--- a/pkg/sentry/platform/kvm/kvm_amd64.go
+++ b/pkg/sentry/platform/kvm/kvm_amd64.go
@@ -17,6 +17,7 @@
package kvm
import (
+ "gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
)
@@ -211,3 +212,11 @@ type cpuidEntries struct {
_ uint32
entries [_KVM_NR_CPUID_ENTRIES]cpuidEntry
}
+
+// updateGlobalOnce does global initialization. It has to be called only once.
+func updateGlobalOnce(fd int) error {
+ physicalInit()
+ err := updateSystemValues(int(fd))
+ ring0.Init(cpuid.HostFeatureSet())
+ return err
+}
diff --git a/pkg/sentry/platform/kvm/kvm_arm64.go b/pkg/sentry/platform/kvm/kvm_arm64.go
new file mode 100644
index 000000000..2319c86d3
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_arm64.go
@@ -0,0 +1,83 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+ "syscall"
+)
+
+// userMemoryRegion is a region of physical memory.
+//
+// This mirrors kvm_memory_region.
+type userMemoryRegion struct {
+ slot uint32
+ flags uint32
+ guestPhysAddr uint64
+ memorySize uint64
+ userspaceAddr uint64
+}
+
+type kvmOneReg struct {
+ id uint64
+ addr uint64
+}
+
+const KVM_NR_SPSR = 5
+
+type userFpsimdState struct {
+ vregs [64]uint64
+ fpsr uint32
+ fpcr uint32
+ reserved [2]uint32
+}
+
+type userRegs struct {
+ Regs syscall.PtraceRegs
+ sp_el1 uint64
+ elr_el1 uint64
+ spsr [KVM_NR_SPSR]uint64
+ fpRegs userFpsimdState
+}
+
+// runData is the run structure. This may be mapped for synchronous register
+// access (although that doesn't appear to be supported by my kernel at least).
+//
+// This mirrors kvm_run.
+type runData struct {
+ requestInterruptWindow uint8
+ _ [7]uint8
+
+ exitReason uint32
+ readyForInterruptInjection uint8
+ ifFlag uint8
+ _ [2]uint8
+
+ cr8 uint64
+ apicBase uint64
+
+ // This is the union data for exits. Interpretation depends entirely on
+ // the exitReason above (see vCPU code for more information).
+ data [32]uint64
+}
+
+// updateGlobalOnce does global initialization. It has to be called only once.
+func updateGlobalOnce(fd int) error {
+ physicalInit()
+ err := updateSystemValues(int(fd))
+ updateVectorTable()
+ return err
+}
diff --git a/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
new file mode 100644
index 000000000..6531bae1d
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_arm64_unsafe.go
@@ -0,0 +1,39 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+ "fmt"
+ "syscall"
+)
+
+var (
+ runDataSize int
+)
+
+func updateSystemValues(fd int) error {
+ // Extract the mmap size.
+ sz, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(fd), _KVM_GET_VCPU_MMAP_SIZE, 0)
+ if errno != 0 {
+ return fmt.Errorf("getting VCPU mmap size: %v", errno)
+ }
+ // Save the data.
+ runDataSize = int(sz)
+
+ // Success.
+ return nil
+}
diff --git a/pkg/sentry/platform/kvm/kvm_const.go b/pkg/sentry/platform/kvm/kvm_const.go
index 766131d60..1d5c77ff4 100644
--- a/pkg/sentry/platform/kvm/kvm_const.go
+++ b/pkg/sentry/platform/kvm/kvm_const.go
@@ -49,11 +49,13 @@ const (
_KVM_EXIT_SHUTDOWN = 0x8
_KVM_EXIT_FAIL_ENTRY = 0x9
_KVM_EXIT_INTERNAL_ERROR = 0x11
+ _KVM_EXIT_SYSTEM_EVENT = 0x18
)
// KVM capability options.
const (
- _KVM_CAP_MAX_VCPUS = 0x42
+ _KVM_CAP_MAX_VCPUS = 0x42
+ _KVM_CAP_ARM_VM_IPA_SIZE = 0xa5
)
// KVM limits.
diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go
new file mode 100644
index 000000000..5a74c6e36
--- /dev/null
+++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go
@@ -0,0 +1,132 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+// KVM ioctls for Arm64.
+const (
+ _KVM_GET_ONE_REG = 0x4010aeab
+ _KVM_SET_ONE_REG = 0x4010aeac
+
+ _KVM_ARM_PREFERRED_TARGET = 0x8020aeaf
+ _KVM_ARM_VCPU_INIT = 0x4020aeae
+ _KVM_ARM64_REGS_PSTATE = 0x6030000000100042
+ _KVM_ARM64_REGS_SP_EL1 = 0x6030000000100044
+ _KVM_ARM64_REGS_R0 = 0x6030000000100000
+ _KVM_ARM64_REGS_R1 = 0x6030000000100002
+ _KVM_ARM64_REGS_R2 = 0x6030000000100004
+ _KVM_ARM64_REGS_R3 = 0x6030000000100006
+ _KVM_ARM64_REGS_R8 = 0x6030000000100010
+ _KVM_ARM64_REGS_R18 = 0x6030000000100024
+ _KVM_ARM64_REGS_PC = 0x6030000000100040
+ _KVM_ARM64_REGS_MAIR_EL1 = 0x603000000013c510
+ _KVM_ARM64_REGS_TCR_EL1 = 0x603000000013c102
+ _KVM_ARM64_REGS_TTBR0_EL1 = 0x603000000013c100
+ _KVM_ARM64_REGS_TTBR1_EL1 = 0x603000000013c101
+ _KVM_ARM64_REGS_SCTLR_EL1 = 0x603000000013c080
+ _KVM_ARM64_REGS_CPACR_EL1 = 0x603000000013c082
+ _KVM_ARM64_REGS_VBAR_EL1 = 0x603000000013c600
+)
+
+// Arm64: Architectural Feature Access Control Register EL1.
+const (
+ _FPEN_NOTRAP = 0x3
+ _FPEN_SHIFT = 0x20
+)
+
+// Arm64: System Control Register EL1.
+const (
+ _SCTLR_M = 1 << 0
+ _SCTLR_C = 1 << 2
+ _SCTLR_I = 1 << 12
+)
+
+// Arm64: Translation Control Register EL1.
+const (
+ _TCR_IPS_40BITS = 2 << 32 // PA=40
+ _TCR_IPS_48BITS = 5 << 32 // PA=48
+
+ _TCR_T0SZ_OFFSET = 0
+ _TCR_T1SZ_OFFSET = 16
+ _TCR_IRGN0_SHIFT = 8
+ _TCR_IRGN1_SHIFT = 24
+ _TCR_ORGN0_SHIFT = 10
+ _TCR_ORGN1_SHIFT = 26
+ _TCR_SH0_SHIFT = 12
+ _TCR_SH1_SHIFT = 28
+ _TCR_TG0_SHIFT = 14
+ _TCR_TG1_SHIFT = 30
+
+ _TCR_T0SZ_VA48 = 64 - 48 // VA=48
+ _TCR_T1SZ_VA48 = 64 - 48 // VA=48
+
+ _TCR_ASID16 = 1 << 36
+ _TCR_TBI0 = 1 << 37
+
+ _TCR_TXSZ_VA48 = (_TCR_T0SZ_VA48 << _TCR_T0SZ_OFFSET) | (_TCR_T1SZ_VA48 << _TCR_T1SZ_OFFSET)
+
+ _TCR_TG0_4K = 0 << _TCR_TG0_SHIFT // 4K
+ _TCR_TG0_64K = 1 << _TCR_TG0_SHIFT // 64K
+
+ _TCR_TG1_4K = 2 << _TCR_TG1_SHIFT
+
+ _TCR_TG_FLAGS = _TCR_TG0_4K | _TCR_TG1_4K
+
+ _TCR_IRGN0_WBWA = 1 << _TCR_IRGN0_SHIFT
+ _TCR_IRGN1_WBWA = 1 << _TCR_IRGN1_SHIFT
+ _TCR_IRGN_WBWA = _TCR_IRGN0_WBWA | _TCR_IRGN1_WBWA
+
+ _TCR_ORGN0_WBWA = 1 << _TCR_ORGN0_SHIFT
+ _TCR_ORGN1_WBWA = 1 << _TCR_ORGN1_SHIFT
+
+ _TCR_ORGN_WBWA = _TCR_ORGN0_WBWA | _TCR_ORGN1_WBWA
+
+ _TCR_SHARED = (3 << _TCR_SH0_SHIFT) | (3 << _TCR_SH1_SHIFT)
+
+ _TCR_CACHE_FLAGS = _TCR_IRGN_WBWA | _TCR_ORGN_WBWA
+)
+
+// Arm64: Memory Attribute Indirection Register EL1.
+const (
+ _MT_DEVICE_nGnRnE = 0
+ _MT_DEVICE_nGnRE = 1
+ _MT_DEVICE_GRE = 2
+ _MT_NORMAL_NC = 3
+ _MT_NORMAL = 4
+ _MT_NORMAL_WT = 5
+ _MT_EL1_INIT = (0 << _MT_DEVICE_nGnRnE) | (0x4 << _MT_DEVICE_nGnRE * 8) | (0xc << _MT_DEVICE_GRE * 8) | (0x44 << _MT_NORMAL_NC * 8) | (0xff << _MT_NORMAL * 8) | (0xbb << _MT_NORMAL_WT * 8)
+)
+
+const (
+ _KVM_ARM_VCPU_POWER_OFF = 0 // CPU is started in OFF state
+ _KVM_ARM_VCPU_PSCI_0_2 = 2 // CPU uses PSCI v0.2
+)
+
+// Arm64: Exception Syndrome Register EL1.
+const (
+ _ESR_ELx_FSC = 0x3F
+
+ _ESR_SEGV_MAPERR_L0 = 0x4
+ _ESR_SEGV_MAPERR_L1 = 0x5
+ _ESR_SEGV_MAPERR_L2 = 0x6
+ _ESR_SEGV_MAPERR_L3 = 0x7
+
+ _ESR_SEGV_ACCERR_L1 = 0x9
+ _ESR_SEGV_ACCERR_L2 = 0xa
+ _ESR_SEGV_ACCERR_L3 = 0xb
+
+ _ESR_SEGV_PEMERR_L1 = 0xd
+ _ESR_SEGV_PEMERR_L2 = 0xe
+ _ESR_SEGV_PEMERR_L3 = 0xf
+)
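Note: the _MT_EL1_INIT constant above packs one attribute byte per memory type into MAIR_EL1, each at an offset of index*8 bits. A small standalone sketch of that packing, with attribute values copied from the constants above:

package main

import "fmt"

func main() {
	// Index -> attribute byte, mirroring the _MT_* constants above.
	attrs := []uint64{0x00, 0x04, 0x0c, 0x44, 0xff, 0xbb}
	var mair uint64
	for idx, attr := range attrs {
		mair |= attr << (uint(idx) * 8)
	}
	fmt.Printf("MAIR_EL1 = %#x\n", mair) // Should match _MT_EL1_INIT.
}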
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index 7d02ebf19..e6d912168 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -17,7 +17,6 @@ package kvm
import (
"fmt"
"runtime"
- "sync"
"sync/atomic"
"syscall"
@@ -27,6 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/platform/ring0"
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
)
// machine contains state associated with the VM as a whole.
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index b99fe425e..873e39dc7 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -90,7 +90,9 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
// Clear from all PCIDs.
for _, c := range m.vCPUs {
- c.PCIDs.Drop(pt)
+ if c.PCIDs != nil {
+ c.PCIDs.Drop(pt)
+ }
}
}
diff --git a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
index 61227cafb..7156c245f 100644
--- a/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_amd64_unsafe.go
@@ -135,3 +135,43 @@ func (c *vCPU) setSignalMask() error {
}
return nil
}
+
+// setUserRegisters sets user registers in the vCPU.
+func (c *vCPU) setUserRegisters(uregs *userRegs) error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_REGS,
+ uintptr(unsafe.Pointer(uregs))); errno != 0 {
+ return fmt.Errorf("error setting user registers: %v", errno)
+ }
+ return nil
+}
+
+// getUserRegisters reloads user registers in the vCPU.
+//
+// This is safe to call from a nosplit context.
+//
+//go:nosplit
+func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_GET_REGS,
+ uintptr(unsafe.Pointer(uregs))); errno != 0 {
+ return errno
+ }
+ return 0
+}
+
+// setSystemRegisters sets system registers.
+func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_SREGS,
+ uintptr(unsafe.Pointer(sregs))); errno != 0 {
+ return fmt.Errorf("error setting system registers: %v", errno)
+ }
+ return nil
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go
index b7e2cfb9d..3b1f20219 100644
--- a/pkg/sentry/platform/kvm/machine_arm64.go
+++ b/pkg/sentry/platform/kvm/machine_arm64.go
@@ -12,8 +12,38 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// +build arm64
+
package kvm
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+type vCPUArchState struct {
+ // PCIDs is the set of PCIDs for this vCPU.
+ //
+ // This starts above fixedKernelPCID.
+ PCIDs *pagetables.PCIDs
+}
+
+const (
+ // fixedKernelPCID is a fixed kernel PCID used for the kernel page
+ // tables. We must start allocating user PCIDs above this in order to
+ // avoid any conflict (see below).
+ fixedKernelPCID = 1
+
+ // poolPCIDs is the number of PCIDs to record in the database. As this
+ // grows, assignment can take longer, since it is a simple linear scan.
+ // Beyond a relatively small number, there are likely few performance
+ // benefits, since the TLB has likely long since lost any translations
+ // from more than a few PCIDs past.
+ poolPCIDs = 8
+)
+
// Get all read-only physicalRegions.
func rdonlyRegionsForSetMem() (phyRegions []physicalRegion) {
var rdonlyRegions []region
@@ -59,3 +89,97 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) {
return phyRegions
}
+
+// dropPageTables drops cached page table entries.
+func (m *machine) dropPageTables(pt *pagetables.PageTables) {
+ m.mu.Lock()
+ defer m.mu.Unlock()
+
+ // Clear from all PCIDs.
+ for _, c := range m.vCPUs {
+ if c.PCIDs != nil {
+ c.PCIDs.Drop(pt)
+ }
+ }
+}
+
+// nonCanonical generates the signal return for a non-canonical address.
+//
+//go:nosplit
+func nonCanonical(addr uint64, signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+ *info = arch.SignalInfo{
+ Signo: signal,
+ Code: arch.SignalInfoKernel,
+ }
+ info.SetAddr(addr) // Include address.
+ return usermem.NoAccess, platform.ErrContextSignal
+}
+
+// fault generates an appropriate fault return.
+//
+//go:nosplit
+func (c *vCPU) fault(signal int32, info *arch.SignalInfo) (usermem.AccessType, error) {
+ faultAddr := c.GetFaultAddr()
+ code, user := c.ErrorCode()
+
+ // Reset the pointed SignalInfo.
+ *info = arch.SignalInfo{Signo: signal}
+ info.SetAddr(uint64(faultAddr))
+
+ read := true
+ write := false
+ execute := true
+
+ ret := code & _ESR_ELx_FSC
+ switch ret {
+ case _ESR_SEGV_MAPERR_L0, _ESR_SEGV_MAPERR_L1, _ESR_SEGV_MAPERR_L2, _ESR_SEGV_MAPERR_L3:
+ info.Code = 1 // SEGV_MAPERR.
+ read = false
+ write = true
+ execute = false
+ case _ESR_SEGV_ACCERR_L1, _ESR_SEGV_ACCERR_L2, _ESR_SEGV_ACCERR_L3, _ESR_SEGV_PEMERR_L1, _ESR_SEGV_PEMERR_L2, _ESR_SEGV_PEMERR_L3:
+ info.Code = 2 // SEGV_ACCERR.
+ read = true
+ write = false
+ execute = false
+ default:
+ info.Code = 2
+ }
+
+ if !user {
+ read = true
+ write = false
+ execute = true
+ }
+ accessType := usermem.AccessType{
+ Read: read,
+ Write: write,
+ Execute: execute,
+ }
+
+ return accessType, platform.ErrContextSignal
+}
+
+// retryInGuest runs the given function in guest mode.
+//
+// If the function does not complete in guest mode (for example, because a GC
+// stall forced execution of a system call), then it is retried. The given
+// function must therefore be idempotent.
+func (m *machine) retryInGuest(fn func()) {
+ c := m.Get()
+ defer m.Put(c)
+ for {
+ c.ClearErrorCode() // See below.
+ bluepill(c) // Force guest mode.
+ fn() // Execute the given function.
+ _, user := c.ErrorCode()
+ if user {
+ // If user is set, then we haven't bailed back to host
+ // mode via a kernel exception or system call. We
+ // consider the full function to have executed in guest
+ // mode and we can return.
+ break
+ }
+ }
+}
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
new file mode 100644
index 000000000..3f2f97a6b
--- /dev/null
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -0,0 +1,362 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package kvm
+
+import (
+ "fmt"
+ "reflect"
+ "sync/atomic"
+ "syscall"
+ "unsafe"
+
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+// setMemoryRegion initializes a region.
+//
+// This may be called from bluepillHandler, and therefore returns an errno
+// directly (instead of wrapping in an error) to avoid allocations.
+//
+//go:nosplit
+func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr) syscall.Errno {
+ userRegion := userMemoryRegion{
+ slot: uint32(slot),
+ flags: 0,
+ guestPhysAddr: uint64(physical),
+ memorySize: uint64(length),
+ userspaceAddr: uint64(virtual),
+ }
+
+ // Set the region.
+ _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(m.fd),
+ _KVM_SET_USER_MEMORY_REGION,
+ uintptr(unsafe.Pointer(&userRegion)))
+ return errno
+}
+
+type kvmVcpuInit struct {
+ target uint32
+ features [7]uint32
+}
+
+var vcpuInit kvmVcpuInit
+
+// initArchState initializes architecture-specific state.
+func (m *machine) initArchState() error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(m.fd),
+ _KVM_ARM_PREFERRED_TARGET,
+ uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
+ panic(fmt.Sprintf("error setting KVM_ARM_PREFERRED_TARGET failed: %v", errno))
+ }
+ return nil
+}
+
+func getPageWithReflect(p uintptr) []byte {
+ return (*(*[0xFFFFFF]byte)(unsafe.Pointer(p & ^uintptr(syscall.Getpagesize()-1))))[:syscall.Getpagesize()]
+}
+
+// Workaround: move ring0.Vectors() to an address with 11-bit alignment.
+//
+// The Arm64 architecture requires the start address of the exception vector
+// table to be 2KB (11-bit) aligned; see arch/arm64/kernel/entry.S in the
+// Linux kernel for reference. Go provides no way to align a function's start
+// address, so the table is relocated at runtime instead. We have raised this
+// question with the Go community:
+// https://groups.google.com/forum/m/#!topic/golang-dev/RPj90l5x86I
+// This function can be removed once Go supports the feature.
+//
+// This function does two jobs:
+// 1. Move the exception vector table to the aligned address.
+// 2. Fix up the offset encoded in each relocated branch instruction.
+func updateVectorTable() {
+ fromLocation := reflect.ValueOf(ring0.Vectors).Pointer()
+ offset := fromLocation & (1<<11 - 1)
+ if offset != 0 {
+ offset = 1<<11 - offset
+ }
+
+ toLocation := fromLocation + offset
+ page := getPageWithReflect(toLocation)
+ if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil {
+ panic(err)
+ }
+
+ page = getPageWithReflect(toLocation + 4096)
+ if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_WRITE|syscall.PROT_EXEC); err != nil {
+ panic(err)
+ }
+
+ // Move the exception vector table to the aligned address.
+ var entry *uint32
+ var entryFrom *uint32
+ for i := 1; i <= 0x800; i++ {
+ entry = (*uint32)(unsafe.Pointer(toLocation + 0x800 - uintptr(i)))
+ entryFrom = (*uint32)(unsafe.Pointer(fromLocation + 0x800 - uintptr(i)))
+ *entry = *entryFrom
+ }
+
+ // Relocating the table changes the offset encoded in each unconditional
+ // branch, so fix up those instructions.
+ nums := []uint32{0x0, 0x80, 0x100, 0x180, 0x200, 0x280, 0x300, 0x380, 0x400, 0x480, 0x500, 0x580, 0x600, 0x680, 0x700, 0x780}
+ for _, num := range nums {
+ entry = (*uint32)(unsafe.Pointer(toLocation + uintptr(num)))
+ *entry = *entry - (uint32)(offset/4)
+ }
+
+ page = getPageWithReflect(toLocation)
+ if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
+ panic(err)
+ }
+
+ page = getPageWithReflect(toLocation + 4096)
+ if err := syscall.Mprotect(page, syscall.PROT_READ|syscall.PROT_EXEC); err != nil {
+ panic(err)
+ }
+}
+
+// initArchState initializes architecture-specific state.
+func (c *vCPU) initArchState() error {
+ var (
+ reg kvmOneReg
+ data uint64
+ regGet kvmOneReg
+ dataGet uint64
+ )
+
+ reg.addr = uint64(reflect.ValueOf(&data).Pointer())
+ regGet.addr = uint64(reflect.ValueOf(&dataGet).Pointer())
+
+ vcpuInit.features[0] |= (1 << _KVM_ARM_VCPU_PSCI_0_2)
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_ARM_VCPU_INIT,
+ uintptr(unsafe.Pointer(&vcpuInit))); errno != 0 {
+ panic(fmt.Sprintf("error setting KVM_ARM_VCPU_INIT failed: %v", errno))
+ }
+
+ // cpacr_el1
+ reg.id = _KVM_ARM64_REGS_CPACR_EL1
+ data = (_FPEN_NOTRAP << _FPEN_SHIFT)
+ if err := c.setOneRegister(&reg); err != nil {
+ return err
+ }
+
+ // sctlr_el1
+ regGet.id = _KVM_ARM64_REGS_SCTLR_EL1
+ if err := c.getOneRegister(&regGet); err != nil {
+ return err
+ }
+
+ dataGet |= (_SCTLR_M | _SCTLR_C | _SCTLR_I)
+ data = dataGet
+ reg.id = _KVM_ARM64_REGS_SCTLR_EL1
+ if err := c.setOneRegister(&reg); err != nil {
+ return err
+ }
+
+ // tcr_el1
+ data = _TCR_TXSZ_VA48 | _TCR_CACHE_FLAGS | _TCR_SHARED | _TCR_TG_FLAGS | _TCR_ASID16 | _TCR_IPS_40BITS
+ reg.id = _KVM_ARM64_REGS_TCR_EL1
+ if err := c.setOneRegister(&reg); err != nil {
+ return err
+ }
+
+ // mair_el1
+ data = _MT_EL1_INIT
+ reg.id = _KVM_ARM64_REGS_MAIR_EL1
+ if err := c.setOneRegister(&reg); err != nil {
+ return err
+ }
+
+ // ttbr0_el1
+ data = c.machine.kernel.PageTables.TTBR0_EL1(false, 0)
+
+ reg.id = _KVM_ARM64_REGS_TTBR0_EL1
+ if err := c.setOneRegister(&reg); err != nil {
+ return err
+ }
+
+ c.SetTtbr0Kvm(uintptr(data))
+
+ // ttbr1_el1
+ data = c.machine.kernel.PageTables.TTBR1_EL1(false, 0)
+
+ reg.id = _KVM_ARM64_REGS_TTBR1_EL1
+ if err := c.setOneRegister(&reg); err != nil {
+ return err
+ }
+
+ // sp_el1
+ data = c.CPU.StackTop()
+ reg.id = _KVM_ARM64_REGS_SP_EL1
+ if err := c.setOneRegister(&reg); err != nil {
+ return err
+ }
+
+ // pc
+ reg.id = _KVM_ARM64_REGS_PC
+ data = uint64(reflect.ValueOf(ring0.Start).Pointer())
+ if err := c.setOneRegister(&reg); err != nil {
+ return err
+ }
+
+ // r8
+ reg.id = _KVM_ARM64_REGS_R8
+ data = uint64(reflect.ValueOf(&c.CPU).Pointer())
+ if err := c.setOneRegister(&reg); err != nil {
+ return err
+ }
+
+ // vbar_el1
+ reg.id = _KVM_ARM64_REGS_VBAR_EL1
+
+ fromLocation := reflect.ValueOf(ring0.Vectors).Pointer()
+ offset := fromLocation & (1<<11 - 1)
+ if offset != 0 {
+ offset = 1<<11 - offset
+ }
+
+ toLocation := fromLocation + offset
+ data = uint64(ring0.KernelStartAddress | toLocation)
+ if err := c.setOneRegister(&reg); err != nil {
+ return err
+ }
+
+ data = ring0.PsrDefaultSet | ring0.KernelFlagsSet
+ reg.id = _KVM_ARM64_REGS_PSTATE
+ if err := c.setOneRegister(&reg); err != nil {
+ return err
+ }
+
+ return nil
+}
+
+//go:nosplit
+func (c *vCPU) loadSegments(tid uint64) {
+ // TODO(gvisor.dev/issue/1238): TLS is not supported.
+ // Get TLS from tpidr_el0.
+ atomic.StoreUint64(&c.tid, tid)
+}
+
+func (c *vCPU) setOneRegister(reg *kvmOneReg) error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_ONE_REG,
+ uintptr(unsafe.Pointer(reg))); errno != 0 {
+ return fmt.Errorf("error setting one register: %v", errno)
+ }
+ return nil
+}
+
+func (c *vCPU) getOneRegister(reg *kvmOneReg) error {
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_GET_ONE_REG,
+ uintptr(unsafe.Pointer(reg))); errno != 0 {
+ return fmt.Errorf("error setting one register: %v", errno)
+ }
+ return nil
+}
+
+// setCPUID sets the CPUID to be used by the guest.
+func (c *vCPU) setCPUID() error {
+ return nil
+}
+
+// setSystemTime sets the TSC for the vCPU.
+func (c *vCPU) setSystemTime() error {
+ return nil
+}
+
+// setSignalMask sets the vCPU signal mask.
+//
+// This must be called prior to running the vCPU.
+func (c *vCPU) setSignalMask() error {
+ // The layout of this structure implies that it will not necessarily be
+ // the same layout chosen by the Go compiler. It gets fudged here.
+ var data struct {
+ length uint32
+ mask1 uint32
+ mask2 uint32
+ _ uint32
+ }
+ data.length = 8 // Fixed sigset size.
+ data.mask1 = ^uint32(bounceSignalMask & 0xffffffff)
+ data.mask2 = ^uint32(bounceSignalMask >> 32)
+ if _, _, errno := syscall.RawSyscall(
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_SIGNAL_MASK,
+ uintptr(unsafe.Pointer(&data))); errno != 0 {
+ return fmt.Errorf("error setting signal mask: %v", errno)
+ }
+
+ return nil
+}
+
+// SwitchToUser unpacks architectural-details.
+func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo) (usermem.AccessType, error) {
+ // Check for canonical addresses.
+ if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Pc) {
+ return nonCanonical(regs.Pc, int32(syscall.SIGSEGV), info)
+ } else if !ring0.IsCanonical(regs.Sp) {
+ return nonCanonical(regs.Sp, int32(syscall.SIGBUS), info)
+ }
+
+ var vector ring0.Vector
+ ttbr0App := switchOpts.PageTables.TTBR0_EL1(false, 0)
+ c.SetTtbr0App(uintptr(ttbr0App))
+
+ // TODO(gvisor.dev/issue/1238): full context-switch support for Arm64.
+ // The Arm64 user-mode execution state consists of:
+ // x0-x30
+ // PC, SP, PSTATE
+ // V0-V31: 32 128-bit registers for floating point and SIMD
+ // FPSR
+ // TPIDR_EL0, used for TLS
+ appRegs := switchOpts.Registers
+ c.SetAppAddr(ring0.KernelStartAddress | uintptr(unsafe.Pointer(appRegs)))
+
+ entersyscall()
+ bluepill(c)
+ vector = c.CPU.SwitchToUser(switchOpts)
+ exitsyscall()
+
+ switch vector {
+ case ring0.Syscall:
+ // Fast path: system call executed.
+ return usermem.NoAccess, nil
+
+ case ring0.PageFault:
+ return c.fault(int32(syscall.SIGSEGV), info)
+ case 0xaa:
+ return usermem.NoAccess, nil
+ default:
+ return usermem.NoAccess, platform.ErrContextSignal
+ }
+
+}
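
The VBAR_EL1 value programmed above rounds the address of ring0.Vectors up to the next 2KB boundary, because the low 11 bits of VBAR_EL1 must be zero, and then ORs in ring0.KernelStartAddress. A minimal Go sketch of the same rounding, using a hypothetical address in place of the real symbol:

    package main

    import "fmt"

    // vectorAlign is the required alignment of the exception vector table:
    // VBAR_EL1 reserves the low 11 bits, i.e. 2KB alignment.
    const vectorAlign = 1 << 11

    // alignVectorBase rounds addr up to the next 2KB boundary, mirroring the
    // offset computation used before writing _KVM_ARM64_REGS_VBAR_EL1.
    func alignVectorBase(addr uint64) uint64 {
        offset := addr & (vectorAlign - 1)
        if offset != 0 {
            offset = vectorAlign - offset
        }
        return addr + offset
    }

    func main() {
        // Hypothetical, unaligned vector table address.
        fmt.Printf("%#x\n", alignVectorBase(0xffff000000081234)) // 0xffff000000081800
    }
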
diff --git a/pkg/sentry/platform/kvm/machine_unsafe.go b/pkg/sentry/platform/kvm/machine_unsafe.go
index ed9433311..f04be2ab5 100644
--- a/pkg/sentry/platform/kvm/machine_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_unsafe.go
@@ -87,46 +87,6 @@ func unmapRunData(r *runData) error {
return nil
}
-// setUserRegisters sets user registers in the vCPU.
-func (c *vCPU) setUserRegisters(uregs *userRegs) error {
- if _, _, errno := syscall.RawSyscall(
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_SET_REGS,
- uintptr(unsafe.Pointer(uregs))); errno != 0 {
- return fmt.Errorf("error setting user registers: %v", errno)
- }
- return nil
-}
-
-// getUserRegisters reloads user registers in the vCPU.
-//
-// This is safe to call from a nosplit context.
-//
-//go:nosplit
-func (c *vCPU) getUserRegisters(uregs *userRegs) syscall.Errno {
- if _, _, errno := syscall.RawSyscall(
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_GET_REGS,
- uintptr(unsafe.Pointer(uregs))); errno != 0 {
- return errno
- }
- return 0
-}
-
-// setSystemRegisters sets system registers.
-func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
- if _, _, errno := syscall.RawSyscall(
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_SET_SREGS,
- uintptr(unsafe.Pointer(sregs))); errno != 0 {
- return fmt.Errorf("error setting system registers: %v", errno)
- }
- return nil
-}
-
// atomicAddressSpace is an atomic address space pointer.
type atomicAddressSpace struct {
pointer unsafe.Pointer
diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go
index 586e91bb2..91de5dab1 100644
--- a/pkg/sentry/platform/kvm/physical_map.go
+++ b/pkg/sentry/platform/kvm/physical_map.go
@@ -24,15 +24,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/usermem"
)
-const (
- // reservedMemory is a chunk of physical memory reserved starting at
- // physical address zero. There are some special pages in this region,
- // so we just call the whole thing off.
- //
- // Other architectures may define this to be zero.
- reservedMemory = 0x100000000
-)
-
type region struct {
virtual uintptr
length uintptr
@@ -59,8 +50,7 @@ func fillAddressSpace() (excludedRegions []region) {
// We can cut vSize in half, because the kernel will be using the top
// half and we ignore it while constructing mappings. It's as if we've
// already excluded half the possible addresses.
- vSize := uintptr(1) << ring0.VirtualAddressBits()
- vSize = vSize >> 1
+ vSize := ring0.UserspaceSize
// We exclude reservedMemory below from our physical memory size, so it
// needs to be dropped here as well. Otherwise, we could end up with
diff --git a/pkg/sentry/platform/kvm/physical_map_amd64.go b/pkg/sentry/platform/kvm/physical_map_amd64.go
new file mode 100644
index 000000000..c5adfb577
--- /dev/null
+++ b/pkg/sentry/platform/kvm/physical_map_amd64.go
@@ -0,0 +1,22 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kvm
+
+const (
+ // reservedMemory is a chunk of physical memory reserved starting at
+ // physical address zero. There are some special pages in this region,
+ // so we just call the whole thing off.
+ reservedMemory = 0x100000000
+)
diff --git a/pkg/sentry/fsimpl/proc/proc.go b/pkg/sentry/platform/kvm/physical_map_arm64.go
index 31dec36de..4d8561453 100644
--- a/pkg/sentry/fsimpl/proc/proc.go
+++ b/pkg/sentry/platform/kvm/physical_map_arm64.go
@@ -12,5 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Package proc implements a partial in-memory file system for procfs.
-package proc
+package kvm
+
+const (
+ reservedMemory = 0
+)
diff --git a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
index 2cd28b2d2..0bebee852 100644
--- a/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
+++ b/pkg/sentry/platform/kvm/testutil/testutil_arm64.s
@@ -50,6 +50,21 @@ TEXT ·SpinLoop(SB),NOSPLIT,$0
start:
B start
+TEXT ·FloatingPointWorks(SB),NOSPLIT,$0-8
+ NO_LOCAL_POINTERS
+ FMOVD $(9.9), F0
+ MOVD $SYS_GETPID, R8 // getpid
+ SVC
+ FMOVD $(9.9), F1
+ FCMPD F0, F1
+ BNE isNaN
+ MOVD $1, R0
+ MOVD R0, ret+0(FP)
+ RET
+isNaN:
+ MOVD $0, ret+0(FP)
+ RET
+
// MVN: bitwise logical NOT
// This case simulates an application that modified R0-R30.
#define TWIDDLE_REGS() \
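
FloatingPointWorks above loads 9.9 into F0, issues a getpid syscall via SVC, reloads 9.9 into F1 and compares the two, so it reports whether FP/SIMD state survives a kernel entry. A hedged sketch of how a test might call it, assuming the Go-side declaration lives in the testutil package like the other helpers here (the real KVM tests would run this inside a guest vCPU rather than directly on the host):

    package kvm_test

    import (
        "testing"

        "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil"
    )

    func TestFloatingPointPreserved(t *testing.T) {
        // FloatingPointWorks returns true if F0 keeps its value across a
        // getpid syscall (see testutil_arm64.s above).
        if !testutil.FloatingPointWorks() {
            t.Errorf("floating-point state was clobbered across a syscall")
        }
    }
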
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index 0df8cfa0f..cd13390c3 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -33,6 +33,7 @@ go_library(
"//pkg/sentry/platform/interrupt",
"//pkg/sentry/platform/safecopy",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
index 7b120a15d..bb0e03880 100644
--- a/pkg/sentry/platform/ptrace/ptrace.go
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -46,13 +46,13 @@ package ptrace
import (
"os"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/platform/interrupt"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
)
var (
diff --git a/pkg/sentry/platform/ptrace/stub_amd64.s b/pkg/sentry/platform/ptrace/stub_amd64.s
index 64c718d21..16f9c523e 100644
--- a/pkg/sentry/platform/ptrace/stub_amd64.s
+++ b/pkg/sentry/platform/ptrace/stub_amd64.s
@@ -64,6 +64,8 @@ begin:
CMPQ AX, $0
JL error
+ MOVQ $0, BX
+
// SIGSTOP to wait for attach.
//
// The SYSCALL instruction will be used for future syscall injection by
@@ -73,23 +75,26 @@ begin:
MOVQ $SIGSTOP, SI
SYSCALL
- // The tracer may "detach" and/or allow code execution here in three cases:
- //
- // 1. New (traced) stub threads are explicitly detached by the
- // goroutine in newSubprocess. However, they are detached while in
- // group-stop, so they do not execute code here.
- //
- // 2. If a tracer thread exits, it implicitly detaches from the stub,
- // potentially allowing code execution here. However, the Go runtime
- // never exits individual threads, so this case never occurs.
- //
- // 3. subprocess.createStub clones a new stub process that is untraced,
+ // The sentry sets BX to 1 when creating a stub process.
+ CMPQ BX, $1
+ JE clone
+
+ // Notify the Sentry that the syscall has exited.
+done:
+ INT $3
+ // Be paranoid.
+ JMP done
+clone:
+ // subprocess.createStub clones a new stub process that is untraced,
// thus executing this code. We setup the PDEATHSIG before SIGSTOPing
// ourselves for attach by the tracer.
//
// R15 has been updated with the expected PPID.
- JMP begin
+ CMPQ AX, $0
+ JE begin
+ // The clone syscall returns a non-zero value.
+ JMP done
error:
// Exit with -errno.
MOVQ AX, DI
diff --git a/pkg/sentry/platform/ptrace/stub_arm64.s b/pkg/sentry/platform/ptrace/stub_arm64.s
index 2c5e4d5cb..6162df02a 100644
--- a/pkg/sentry/platform/ptrace/stub_arm64.s
+++ b/pkg/sentry/platform/ptrace/stub_arm64.s
@@ -59,6 +59,8 @@ begin:
CMP $0x0, R0
BLT error
+ MOVD $0, R9
+
// SIGSTOP to wait for attach.
//
// The SYSCALL instruction will be used for future syscall injection by
@@ -66,22 +68,26 @@ begin:
MOVD $SYS_KILL, R8
MOVD $SIGSTOP, R1
SVC
- // The tracer may "detach" and/or allow code execution here in three cases:
- //
- // 1. New (traced) stub threads are explicitly detached by the
- // goroutine in newSubprocess. However, they are detached while in
- // group-stop, so they do not execute code here.
- //
- // 2. If a tracer thread exits, it implicitly detaches from the stub,
- // potentially allowing code execution here. However, the Go runtime
- // never exits individual threads, so this case never occurs.
- //
- // 3. subprocess.createStub clones a new stub process that is untraced,
+
+ // The sentry sets R9 to 1 when creating a stub process.
+ CMP $1, R9
+ BEQ clone
+
+done:
+ // Notify the Sentry that the syscall has exited.
+ BRK $3
+ B done // Be paranoid.
+clone:
+ // subprocess.createStub clones a new stub process that is untraced,
// thus executing this code. We setup the PDEATHSIG before SIGSTOPing
// ourselves for attach by the tracer.
//
// R7 has been updated with the expected PPID.
- B begin
+ CMP $0, R0
+ BEQ begin
+
+ // The clone system call returned a non-zero value.
+ B done
error:
// Exit with -errno.
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index ddb1f41e3..15dc46a5b 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -18,14 +18,15 @@ import (
"fmt"
"os"
"runtime"
- "sync"
"syscall"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/procid"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
)
// Linux kernel errnos which "should never be seen by user programs", but will
@@ -429,13 +430,15 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
}
for {
- // Execute the syscall instruction.
- if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
+ // Execute the syscall instruction. The task has to stop at the
+ // trap instruction, which is placed right after the syscall
+ // instruction.
+ if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_CONT, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
}
sig := t.wait(stopped)
- if sig == (syscallEvent | syscall.SIGTRAP) {
+ if sig == syscall.SIGTRAP {
// Reached syscall-enter-stop.
break
} else {
@@ -447,18 +450,6 @@ func (t *thread) syscall(regs *syscall.PtraceRegs) (uintptr, error) {
}
}
- // Complete the actual system call.
- if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSCALL, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
- panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
- }
-
- // Wait for syscall-exit-stop. "[Signal-delivery-stop] never happens
- // between syscall-enter-stop and syscall-exit-stop; it happens *after*
- // syscall-exit-stop.)" - ptrace(2), "Syscall-stops"
- if sig := t.wait(stopped); sig != (syscallEvent | syscall.SIGTRAP) {
- t.dumpAndPanic(fmt.Sprintf("wait failed: expected SIGTRAP, got %v [%d]", sig, sig))
- }
-
// Grab registers.
if err := t.getRegs(regs); err != nil {
panic(fmt.Sprintf("ptrace get regs failed: %v", err))
@@ -541,14 +532,14 @@ func (s *subprocess) switchToApp(c *context, ac arch.Context) bool {
if isSingleStepping(regs) {
if _, _, errno := syscall.RawSyscall6(
syscall.SYS_PTRACE,
- syscall.PTRACE_SYSEMU_SINGLESTEP,
+ unix.PTRACE_SYSEMU_SINGLESTEP,
uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
panic(fmt.Sprintf("ptrace sysemu failed: %v", errno))
}
} else {
if _, _, errno := syscall.RawSyscall6(
syscall.SYS_PTRACE,
- syscall.PTRACE_SYSEMU,
+ unix.PTRACE_SYSEMU,
uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
panic(fmt.Sprintf("ptrace sysemu failed: %v", errno))
}
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 4649a94a7..e99798c56 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -21,6 +21,8 @@ import (
"strings"
"syscall"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/seccomp"
"gvisor.dev/gvisor/pkg/sentry/arch"
)
@@ -139,7 +141,55 @@ func (t *thread) adjustInitRegsRip() {
t.initRegs.Rip -= initRegsRipAdjustment
}
-// Pass the expected PPID to the child via R15 when creating stub process
+// Pass the expected PPID to the child via R15 when creating a stub process.
func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
initregs.R15 = uint64(ppid)
+ // Rbx has to be set to 1 when creating a stub process.
+ initregs.Rbx = 1
+}
+
+// patchSignalInfo patches the signal info to account for hitting the seccomp
+// filters from vsyscall emulation, specified below. We allow for SIGSYS as a
+// synchronous trap, but patch the structure to appear like a SIGSEGV with the
+// Rip as the faulting address.
+//
+// Note that this should only be called after verifying that the signalInfo has
+// been generated by the kernel.
+func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
+ if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
+ signalInfo.Signo = int32(linux.SIGSEGV)
+
+ // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered
+ // with the si_call_addr field pointing to the current RIP. This field
+ // aligns with the si_addr field for a SIGSEGV, so we don't need to touch
+ // anything there. We do need to unwind emulation however, so we set the
+ // instruction pointer to the faulting value, and "unpop" the stack.
+ regs.Rip = signalInfo.Addr()
+ regs.Rsp -= 8
+ }
+}
+
+// enableCpuidFault enables cpuid-faulting.
+//
+// This may fail on older kernels or hardware, so we just disregard the result.
+// Host CPUID will be enabled.
+//
+// This is safe to call in an afterFork context.
+//
+//go:nosplit
+func enableCpuidFault() {
+ syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0)
+}
+
+// appendArchSeccompRules appends architecture-specific seccomp rules when creating the BPF program.
+// See attachedThread() for more detail.
+func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
+ return append(rules, seccomp.RuleSet{
+ Rules: seccomp.SyscallRules{
+ syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
+ {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
+ },
+ },
+ Action: linux.SECCOMP_RET_ALLOW,
+ })
}
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
index bec884ba5..7b975137f 100644
--- a/pkg/sentry/platform/ptrace/subprocess_arm64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -17,8 +17,12 @@
package ptrace
import (
+ "fmt"
+ "strings"
"syscall"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/seccomp"
"gvisor.dev/gvisor/pkg/sentry/arch"
)
@@ -37,7 +41,7 @@ const (
// resetSysemuRegs sets up emulation registers.
//
// This should be called prior to calling sysemu.
-func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) {
+func (t *thread) resetSysemuRegs(regs *syscall.PtraceRegs) {
}
// createSyscallRegs sets up syscall registers.
@@ -123,4 +127,39 @@ func (t *thread) adjustInitRegsRip() {
// Pass the expected PPID to the child via X7 when creating stub process
func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
initregs.Regs[7] = uint64(ppid)
+ // R9 has to be set to 1 when creating a stub process.
+ initregs.Regs[9] = 1
+}
+
+// patchSignalInfo patches the signal info to account for hitting the seccomp
+// filters from vsyscall emulation, specified below. We allow for SIGSYS as a
+// synchronous trap, but patch the structure to appear like a SIGSEGV with the
+// PC as the faulting address.
+//
+// Note that this should only be called after verifying that the signalInfo has
+// been generated by the kernel.
+func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
+ if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
+ signalInfo.Signo = int32(linux.SIGSEGV)
+
+ // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered
+ // with the si_call_addr field pointing to the current RIP. This field
+ // aligns with the si_addr field for a SIGSEGV, so we don't need to touch
+ // anything there. We do need to unwind emulation however, so we set the
+ // instruction pointer to the faulting value, and "unpop" the stack.
+ regs.Pc = signalInfo.Addr()
+ regs.Sp -= 8
+ }
+}
+
+// enableCpuidFault is a no-op on arm64.
+//
+//go:nosplit
+func enableCpuidFault() {
+}
+
+// appendArchSeccompRules appends architecture-specific seccomp rules when creating the BPF program.
+// See attachedThread() for more detail.
+func appendArchSeccompRules(rules []seccomp.RuleSet) []seccomp.RuleSet {
+ return rules
}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 3782d4332..74968dfdf 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -20,6 +20,7 @@ import (
"fmt"
"syscall"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/procid"
@@ -53,7 +54,7 @@ func probeSeccomp() bool {
for {
// Attempt an emulation.
- if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, syscall.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
+ if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
}
@@ -77,27 +78,6 @@ func probeSeccomp() bool {
}
}
-// patchSignalInfo patches the signal info to account for hitting the seccomp
-// filters from vsyscall emulation, specified below. We allow for SIGSYS as a
-// synchronous trap, but patch the structure to appear like a SIGSEGV with the
-// Rip as the faulting address.
-//
-// Note that this should only be called after verifying that the signalInfo has
-// been generated by the kernel.
-func patchSignalInfo(regs *syscall.PtraceRegs, signalInfo *arch.SignalInfo) {
- if linux.Signal(signalInfo.Signo) == linux.SIGSYS {
- signalInfo.Signo = int32(linux.SIGSEGV)
-
- // Unwind the kernel emulation, if any has occurred. A SIGSYS is delivered
- // with the si_call_addr field pointing to the current RIP. This field
- // aligns with the si_addr field for a SIGSEGV, so we don't need to touch
- // anything there. We do need to unwind emulation however, so we set the
- // instruction pointer to the faulting value, and "unpop" the stack.
- regs.Rip = signalInfo.Addr()
- regs.Rsp -= 8
- }
-}
-
// createStub creates a fresh stub processes.
//
// Precondition: the runtime OS thread must be locked.
@@ -149,7 +129,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
Rules: seccomp.SyscallRules{
syscall.SYS_GETTIMEOFDAY: {},
syscall.SYS_TIME: {},
- 309: {}, // SYS_GETCPU.
+ unix.SYS_GETCPU: {}, // SYS_GETCPU was not defined in package syscall on amd64.
},
Action: linux.SECCOMP_RET_TRAP,
Vsyscall: true,
@@ -173,10 +153,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
// For the initial process creation.
syscall.SYS_WAIT4: {},
- syscall.SYS_ARCH_PRCTL: []seccomp.Rule{
- {seccomp.AllowValue(linux.ARCH_SET_CPUID), seccomp.AllowValue(0)},
- },
- syscall.SYS_EXIT: {},
+ syscall.SYS_EXIT: {},
// For the stub prctl dance (all).
syscall.SYS_PRCTL: []seccomp.Rule{
@@ -196,6 +173,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
},
Action: linux.SECCOMP_RET_ALLOW,
})
+
+ rules = appendArchSeccompRules(rules)
}
instrs, err := seccomp.BuildProgram(rules, defaultAction)
if err != nil {
@@ -267,9 +246,8 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
}
- // Enable cpuid-faulting; this may fail on older kernels or hardware,
- // so we just disregard the result. Host CPUID will be enabled.
- syscall.RawSyscall6(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0, 0, 0, 0)
+ // Enable cpuid-faulting.
+ enableCpuidFault()
// Call the stub; should not return.
stubCall(stubStart, ppid)
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
index 2e6fbe488..245b20722 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
@@ -18,7 +18,6 @@
package ptrace
import (
- "sync"
"sync/atomic"
"syscall"
"unsafe"
@@ -26,6 +25,7 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
+ "gvisor.dev/gvisor/pkg/sync"
)
// maskPool contains reusable CPU masks for setting affinity. Unfortunately,
diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD
index f1af18265..87f4552b5 100644
--- a/pkg/sentry/platform/ring0/BUILD
+++ b/pkg/sentry/platform/ring0/BUILD
@@ -71,6 +71,7 @@ go_library(
"lib_amd64.go",
"lib_amd64.s",
"lib_arm64.go",
+ "lib_arm64.s",
"ring0.go",
],
importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0",
diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go
index 3f094c2a7..86fd5ed58 100644
--- a/pkg/sentry/platform/ring0/defs.go
+++ b/pkg/sentry/platform/ring0/defs.go
@@ -17,7 +17,7 @@ package ring0
import (
"syscall"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
)
// Kernel is a global kernel object.
diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go
index 10dbd381f..9dae0dccb 100644
--- a/pkg/sentry/platform/ring0/defs_amd64.go
+++ b/pkg/sentry/platform/ring0/defs_amd64.go
@@ -18,6 +18,7 @@ package ring0
import (
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
)
var (
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
index fbfbd9bab..a850ce6cf 100644
--- a/pkg/sentry/platform/ring0/defs_arm64.go
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -18,6 +18,7 @@ package ring0
import (
"gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
)
var (
@@ -73,6 +74,9 @@ type CPUArchState struct {
// application context pointer.
appAddr uintptr
+
+ // lazyVFP is the value of cpacr_el1.
+ lazyVFP uintptr
}
// ErrorCode returns the last error code.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 0ba4c6b73..da07815ff 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -31,6 +31,11 @@
#define RSV_REG R18_PLATFORM
#define RSV_REG_APP R9
+#define FPEN_NOTRAP 0x3
+#define FPEN_SHIFT 20
+
+#define FPEN_ENABLE (FPEN_NOTRAP << FPEN_SHIFT)
+
#define REGISTERS_SAVE(reg, offset) \
MOVD R0, offset+PTRACE_R0(reg); \
MOVD R1, offset+PTRACE_R1(reg); \
@@ -279,6 +284,16 @@
#define IRQ_DISABLE \
MSR $2, DAIFClr;
+#define VFP_ENABLE \
+ MOVD $FPEN_ENABLE, R0; \
+ WORD $0xd5181040; \ //MSR R0, CPACR_EL1
+ ISB $15;
+
+#define VFP_DISABLE \
+ MOVD $0x0, R0; \
+ WORD $0xd5181040; \ //MSR R0, CPACR_EL1
+ ISB $15;
+
#define KERNEL_ENTRY_FROM_EL0 \
SUB $16, RSP, RSP; \ // step1, save r18, r9 into kernel temporary stack.
STP (RSV_REG, RSV_REG_APP), 16*0(RSP); \
@@ -318,6 +333,11 @@ TEXT ·Halt(SB),NOSPLIT,$0
BNE mmio_exit
MOVD $0, CPU_REGISTERS+PTRACE_R9(RSV_REG)
mmio_exit:
+ // Disable fpsimd.
+ WORD $0xd5381041 // MRS CPACR_EL1, R1
+ MOVD R1, CPU_LAZY_VFP(RSV_REG)
+ VFP_DISABLE
+
// MMIO_EXIT.
MOVD $0, R9
MOVD R0, 0xffff000000001000(R9)
@@ -337,6 +357,73 @@ TEXT ·Current(SB),NOSPLIT,$0-8
#define STACK_FRAME_SIZE 16
TEXT ·kernelExitToEl0(SB),NOSPLIT,$0
+ // Step 1: save the sentry context into memory.
+ REGISTERS_SAVE(RSV_REG, CPU_REGISTERS)
+ MOVD RSV_REG_APP, CPU_REGISTERS+PTRACE_R9(RSV_REG)
+
+ WORD $0xd5384003 // MRS SPSR_EL1, R3
+ MOVD R3, CPU_REGISTERS+PTRACE_PSTATE(RSV_REG)
+ MOVD R30, CPU_REGISTERS+PTRACE_PC(RSV_REG)
+ MOVD RSP, R3
+ MOVD R3, CPU_REGISTERS+PTRACE_SP(RSV_REG)
+
+ MOVD CPU_REGISTERS+PTRACE_R3(RSV_REG), R3
+
+ // Step 2: save SP_EL1 and PSTATE into the kernel temporary stack.
+ // switch to temporary stack.
+ LOAD_KERNEL_STACK(RSV_REG)
+ WORD $0xd538d092 //MRS TPIDR_EL1, R18
+
+ SUB $STACK_FRAME_SIZE, RSP, RSP
+ MOVD CPU_REGISTERS+PTRACE_SP(RSV_REG), R11
+ MOVD CPU_REGISTERS+PTRACE_PSTATE(RSV_REG), R12
+ STP (R11, R12), 16*0(RSP)
+
+ MOVD CPU_REGISTERS+PTRACE_R11(RSV_REG), R11
+ MOVD CPU_REGISTERS+PTRACE_R12(RSV_REG), R12
+
+ // Step 3: test the user pagetable.
+ // If the user pagetable is empty, we will trap into el1_ia.
+ WORD $0xd538d092 //MRS TPIDR_EL1, R18
+ SWITCH_TO_APP_PAGETABLE(RSV_REG)
+ WORD $0xd538d092 //MRS TPIDR_EL1, R18
+ SWITCH_TO_KVM_PAGETABLE(RSV_REG)
+ WORD $0xd538d092 //MRS TPIDR_EL1, R18
+
+ // If the pagetable is not empty, recover the kernel temporary stack.
+ ADD $STACK_FRAME_SIZE, RSP, RSP
+
+ // Step 4: load the application context pointer.
+ MOVD CPU_APP_ADDR(RSV_REG), RSV_REG_APP
+
+ // Step 5: prepare the environment for the container application.
+ // set sp_el0.
+ MOVD PTRACE_SP(RSV_REG_APP), R1
+ WORD $0xd5184101 //MSR R1, SP_EL0
+ // set pc.
+ MOVD PTRACE_PC(RSV_REG_APP), R1
+ MSR R1, ELR_EL1
+ // set pstate.
+ MOVD PTRACE_PSTATE(RSV_REG_APP), R1
+ WORD $0xd5184001 //MSR R1, SPSR_EL1
+
+ // RSV_REG & RSV_REG_APP will be loaded at the end.
+ REGISTERS_LOAD(RSV_REG_APP, 0)
+
+ // switch to user pagetable.
+ MOVD PTRACE_R18(RSV_REG_APP), RSV_REG
+ MOVD PTRACE_R9(RSV_REG_APP), RSV_REG_APP
+
+ SUB $STACK_FRAME_SIZE, RSP, RSP
+ STP (RSV_REG, RSV_REG_APP), 16*0(RSP)
+
+ WORD $0xd538d092 //MRS TPIDR_EL1, R18
+
+ SWITCH_TO_APP_PAGETABLE(RSV_REG)
+
+ LDP 16*0(RSP), (RSV_REG, RSV_REG_APP)
+ ADD $STACK_FRAME_SIZE, RSP, RSP
+
ERET()
TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
@@ -382,6 +469,8 @@ TEXT ·El1_sync(SB),NOSPLIT,$0
BEQ el1_svc
CMP $ESR_ELx_EC_BREAKPT_CUR, R24
BGE el1_dbg
+ CMP $ESR_ELx_EC_FP_ASIMD, R24
+ BEQ el1_fpsimd_acc
B el1_invalid
el1_da:
@@ -412,6 +501,10 @@ el1_svc:
el1_dbg:
B ·Shutdown(SB)
+el1_fpsimd_acc:
+ VFP_ENABLE
+ B ·kernelExitToEl1(SB) // Resume.
+
el1_invalid:
B ·Shutdown(SB)
@@ -538,6 +631,10 @@ TEXT ·Vectors(SB),NOSPLIT,$0
B ·El0_error_invalid(SB)
nop31Instructions()
+ // The exception vector table must be 2KB (11-bit) aligned; see the Linux
+ // source for reference: arch/arm64/kernel/entry.S.
+ // For gVisor, the table is defined as 4KB long, with the second 2KB filled
+ // with NOPs so that the first 2KB can be safely moved to a 2KB-aligned address.
WORD $0xd503201f //nop
nop31Instructions()
WORD $0xd503201f
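
The FPEN_* macros introduced above build the CPACR_EL1.FPEN field: the value 0b11 in bits [21:20] disables trapping of FP/SIMD instructions, which is what VFP_ENABLE writes and VFP_DISABLE clears, while Halt stashes the previous value in CPU_LAZY_VFP. A small Go sketch of the same bit arithmetic (the lower-case names are illustrative, not exported constants):

    package main

    import "fmt"

    const (
        fpenNoTrap = 0x3 // CPACR_EL1.FPEN = 0b11: do not trap FP/SIMD.
        fpenShift  = 20  // FPEN occupies bits [21:20] of CPACR_EL1.
        fpenEnable = fpenNoTrap << fpenShift
    )

    func main() {
        // Matches the FPEN_ENABLE value that VFP_ENABLE writes to CPACR_EL1.
        fmt.Printf("FPEN_ENABLE = %#x\n", fpenEnable) // 0x300000
    }
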
diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go
index 900ee6380..8bcfe1032 100644
--- a/pkg/sentry/platform/ring0/lib_arm64.go
+++ b/pkg/sentry/platform/ring0/lib_arm64.go
@@ -16,10 +16,24 @@
package ring0
-// LoadFloatingPoint loads floating point state by the most efficient mechanism
-// available (set by Init).
-var LoadFloatingPoint func(*byte)
+// CPACREL1 returns the value of the CPACR_EL1 register.
+func CPACREL1() (value uintptr)
-// SaveFloatingPoint saves floating point state by the most efficient mechanism
-// available (set by Init).
-var SaveFloatingPoint func(*byte)
+// GetFPCR returns the value of the FPCR register.
+func GetFPCR() (value uintptr)
+
+// SetFPCR writes the FPCR value.
+func SetFPCR(value uintptr)
+
+// GetFPSR returns the value of the FPSR register.
+func GetFPSR() (value uintptr)
+
+// SetFPSR writes the FPSR value.
+func SetFPSR(value uintptr)
+
+// SaveVRegs saves V0-V31 registers.
+// V0-V31: 32 128-bit registers for floating point and simd.
+func SaveVRegs(*byte)
+
+// LoadVRegs loads V0-V31 registers.
+func LoadVRegs(*byte)
diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s
new file mode 100644
index 000000000..0e6a6235b
--- /dev/null
+++ b/pkg/sentry/platform/ring0/lib_arm64.s
@@ -0,0 +1,121 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+TEXT ·CPACREL1(SB),NOSPLIT,$0-8
+ WORD $0xd5381041 // MRS CPACR_EL1, R1
+ MOVD R1, ret+0(FP)
+ RET
+
+TEXT ·GetFPCR(SB),NOSPLIT,$0-8
+ WORD $0xd53b4201 // MRS NZCV, R1
+ MOVD R1, ret+0(FP)
+ RET
+
+TEXT ·GetFPSR(SB),NOSPLIT,$0-8
+ WORD $0xd53b4421 // MRS FPSR, R1
+ MOVD R1, ret+0(FP)
+ RET
+
+TEXT ·SetFPCR(SB),NOSPLIT,$0-8
+ MOVD addr+0(FP), R1
+ WORD $0xd51b4201 // MSR R1, NZCV
+ RET
+
+TEXT ·SetFPSR(SB),NOSPLIT,$0-8
+ MOVD addr+0(FP), R1
+ WORD $0xd51b4421 // MSR R1, FPSR
+ RET
+
+TEXT ·SaveVRegs(SB),NOSPLIT,$0-8
+ MOVD addr+0(FP), R0
+
+ // Skip aarch64_ctx, fpsr, fpcr.
+ FMOVD F0, 16*1(R0)
+ FMOVD F1, 16*2(R0)
+ FMOVD F2, 16*3(R0)
+ FMOVD F3, 16*4(R0)
+ FMOVD F4, 16*5(R0)
+ FMOVD F5, 16*6(R0)
+ FMOVD F6, 16*7(R0)
+ FMOVD F7, 16*8(R0)
+ FMOVD F8, 16*9(R0)
+ FMOVD F9, 16*10(R0)
+ FMOVD F10, 16*11(R0)
+ FMOVD F11, 16*12(R0)
+ FMOVD F12, 16*13(R0)
+ FMOVD F13, 16*14(R0)
+ FMOVD F14, 16*15(R0)
+ FMOVD F15, 16*16(R0)
+ FMOVD F16, 16*17(R0)
+ FMOVD F17, 16*18(R0)
+ FMOVD F18, 16*19(R0)
+ FMOVD F19, 16*20(R0)
+ FMOVD F20, 16*21(R0)
+ FMOVD F21, 16*22(R0)
+ FMOVD F22, 16*23(R0)
+ FMOVD F23, 16*24(R0)
+ FMOVD F24, 16*25(R0)
+ FMOVD F25, 16*26(R0)
+ FMOVD F26, 16*27(R0)
+ FMOVD F27, 16*28(R0)
+ FMOVD F28, 16*29(R0)
+ FMOVD F29, 16*30(R0)
+ FMOVD F30, 16*31(R0)
+ FMOVD F31, 16*32(R0)
+ ISB $15
+
+ RET
+
+TEXT ·LoadVRegs(SB),NOSPLIT,$0-8
+ MOVD addr+0(FP), R0
+
+ // Skip aarch64_ctx, fpsr, fpcr.
+ FMOVD 16*1(R0), F0
+ FMOVD 16*2(R0), F1
+ FMOVD 16*3(R0), F2
+ FMOVD 16*4(R0), F3
+ FMOVD 16*5(R0), F4
+ FMOVD 16*6(R0), F5
+ FMOVD 16*7(R0), F6
+ FMOVD 16*8(R0), F7
+ FMOVD 16*9(R0), F8
+ FMOVD 16*10(R0), F9
+ FMOVD 16*11(R0), F10
+ FMOVD 16*12(R0), F11
+ FMOVD 16*13(R0), F12
+ FMOVD 16*14(R0), F13
+ FMOVD 16*15(R0), F14
+ FMOVD 16*16(R0), F15
+ FMOVD 16*17(R0), F16
+ FMOVD 16*18(R0), F17
+ FMOVD 16*19(R0), F18
+ FMOVD 16*20(R0), F19
+ FMOVD 16*21(R0), F20
+ FMOVD 16*22(R0), F21
+ FMOVD 16*23(R0), F22
+ FMOVD 16*24(R0), F23
+ FMOVD 16*25(R0), F24
+ FMOVD 16*26(R0), F25
+ FMOVD 16*27(R0), F26
+ FMOVD 16*28(R0), F27
+ FMOVD 16*29(R0), F28
+ FMOVD 16*30(R0), F29
+ FMOVD 16*31(R0), F30
+ FMOVD 16*32(R0), F31
+ ISB $15
+
+ RET
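
SaveVRegs and LoadVRegs address the buffer at 16-byte strides starting at offset 16, skipping space for the aarch64_ctx header, fpsr and fpcr, so callers must hand them a buffer covering the header plus 32 vector-register slots. A hedged sizing sketch (the constant names are illustrative, not part of this change):

    package main

    import "fmt"

    const (
        fpsimdHeaderBytes = 16 // Skipped space for aarch64_ctx, fpsr, fpcr.
        vRegBytes         = 16 // Each of V0-V31 is 128 bits wide.
        numVRegs          = 32
        fpsimdStateBytes  = fpsimdHeaderBytes + numVRegs*vRegBytes // 528 bytes.
    )

    func main() {
        // A buffer of this size is large enough for SaveVRegs/LoadVRegs;
        // ring0.SaveVRegs(&state[0]) would then store V0-V31 from offset 16 on.
        state := make([]byte, fpsimdStateBytes)
        fmt.Println(len(state)) // 528
    }
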
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index d7aa1c7cc..cd2a65f97 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -39,6 +39,7 @@ func Emit(w io.Writer) {
fmt.Fprintf(w, "#define CPU_TTBR0_APP 0x%02x\n", reflect.ValueOf(&c.ttbr0App).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "#define CPU_VECTOR_CODE 0x%02x\n", reflect.ValueOf(&c.vecCode).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "#define CPU_APP_ADDR 0x%02x\n", reflect.ValueOf(&c.appAddr).Pointer()-reflect.ValueOf(c).Pointer())
+ fmt.Fprintf(w, "#define CPU_LAZY_VFP 0x%02x\n", reflect.ValueOf(&c.lazyVFP).Pointer()-reflect.ValueOf(c).Pointer())
fmt.Fprintf(w, "\n// Bits.\n")
fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet)
diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD
index e2e15ba5c..387a7f6c3 100644
--- a/pkg/sentry/platform/ring0/pagetables/BUILD
+++ b/pkg/sentry/platform/ring0/pagetables/BUILD
@@ -96,7 +96,10 @@ go_library(
"//pkg/sentry/platform/kvm:__subpackages__",
"//pkg/sentry/platform/ring0:__subpackages__",
],
- deps = ["//pkg/sentry/usermem"],
+ deps = [
+ "//pkg/sentry/usermem",
+ "//pkg/sync",
+ ],
)
go_test(
diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
index 0f029f25d..e199bae18 100644
--- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
+++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go
@@ -17,7 +17,7 @@
package pagetables
import (
- "sync"
+ "gvisor.dev/gvisor/pkg/sync"
)
// limitPCID is the number of valid PCIDs.
diff --git a/pkg/sentry/sighandling/sighandling.go b/pkg/sentry/sighandling/sighandling.go
index 2f65db70b..ba1f9043d 100644
--- a/pkg/sentry/sighandling/sighandling.go
+++ b/pkg/sentry/sighandling/sighandling.go
@@ -16,7 +16,6 @@
package sighandling
import (
- "fmt"
"os"
"os/signal"
"reflect"
@@ -31,37 +30,25 @@ const numSignals = 32
// handleSignals listens for incoming signals and calls the given handler
// function.
//
-// It starts when the start channel is closed, stops when the stop channel
-// is closed, and closes done once it will no longer deliver signals to k.
-func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start, stop, done chan struct{}) {
+// It stops when the stop channel is closed. The done channel is closed once it
+// will no longer deliver signals to k.
+func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), stop, done chan struct{}) {
// Build a select case.
- sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(start)}}
+ sc := []reflect.SelectCase{{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)}}
for _, sigchan := range sigchans {
sc = append(sc, reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(sigchan)})
}
- started := false
for {
// Wait for a notification.
index, _, ok := reflect.Select(sc)
- // Was it the start / stop channel?
+ // Was it the stop channel?
if index == 0 {
if !ok {
- if !started {
- // start channel; start forwarding and
- // swap this case for the stop channel
- // to select stop requests.
- started = true
- sc[0] = reflect.SelectCase{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(stop)}
- } else {
- // stop channel; stop forwarding and
- // clear this case so it is never
- // selected again.
- started = false
- close(done)
- sc[0].Chan = reflect.Value{}
- }
+ // Stop forwarding and notify that it's done.
+ close(done)
+ return
}
continue
}
@@ -73,44 +60,17 @@ func handleSignals(sigchans []chan os.Signal, handler func(linux.Signal), start,
// Otherwise, it was a signal on channel N. Index 0 represents the stop
// channel, so index N represents the channel for signal N.
- signal := linux.Signal(index)
-
- if !started {
- // Kernel cannot receive signals, either because it is
- // not ready yet or is shutting down.
- //
- // Kill ourselves if this signal would have killed the
- // process before PrepareForwarding was called. i.e., all
- // _SigKill signals; see Go
- // src/runtime/sigtab_linux_generic.go.
- //
- // Otherwise ignore the signal.
- //
- // TODO(b/114489875): Drop in Go 1.12, which uses tgkill
- // in runtime.raise.
- switch signal {
- case linux.SIGHUP, linux.SIGINT, linux.SIGTERM:
- dieFromSignal(signal)
- panic(fmt.Sprintf("Failed to die from signal %d", signal))
- default:
- continue
- }
- }
-
- // Pass the signal to the handler.
- handler(signal)
+ handler(linux.Signal(index))
}
}
-// PrepareHandler ensures that synchronous signals are passed to the given
-// handler function and returns a callback that starts signal delivery, which
-// itself returns a callback that stops signal handling.
+// StartSignalForwarding ensures that synchronous signals are passed to the
+// given handler function and returns a callback that stops signal delivery.
//
// Note that this function permanently takes over signal handling. After the
// stop callback, signals revert to the default Go runtime behavior, which
// cannot be overridden with external calls to signal.Notify.
-func PrepareHandler(handler func(linux.Signal)) func() func() {
- start := make(chan struct{})
+func StartSignalForwarding(handler func(linux.Signal)) func() {
stop := make(chan struct{})
done := make(chan struct{})
@@ -128,13 +88,10 @@ func PrepareHandler(handler func(linux.Signal)) func() func() {
signal.Notify(sigchan, syscall.Signal(sig))
}
// Start up our listener.
- go handleSignals(sigchans, handler, start, stop, done) // S/R-SAFE: synchronized by Kernel.extMu.
+ go handleSignals(sigchans, handler, stop, done) // S/R-SAFE: synchronized by Kernel.extMu.
- return func() func() {
- close(start)
- return func() {
- close(stop)
- <-done
- }
+ return func() {
+ close(stop)
+ <-done
}
}
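
With the start channel gone, forwarding begins as soon as StartSignalForwarding is called and the single returned callback stops it. A hedged usage sketch matching the new signature (the handler body and logging are illustrative):

    package main

    import (
        "log"

        "gvisor.dev/gvisor/pkg/abi/linux"
        "gvisor.dev/gvisor/pkg/sentry/sighandling"
    )

    func main() {
        // Forwarding starts immediately; there is no separate "start" step anymore.
        stop := sighandling.StartSignalForwarding(func(sig linux.Signal) {
            log.Printf("forwarding signal %d", sig)
        })
        // ... run the workload ...
        stop() // Stop forwarding and wait for the handler goroutine to finish.
    }
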
diff --git a/pkg/sentry/sighandling/sighandling_unsafe.go b/pkg/sentry/sighandling/sighandling_unsafe.go
index c303435d5..1ebe22d34 100644
--- a/pkg/sentry/sighandling/sighandling_unsafe.go
+++ b/pkg/sentry/sighandling/sighandling_unsafe.go
@@ -15,8 +15,6 @@
package sighandling
import (
- "fmt"
- "runtime"
"syscall"
"unsafe"
@@ -48,27 +46,3 @@ func IgnoreChildStop() error {
return nil
}
-
-// dieFromSignal kills the current process with sig.
-//
-// Preconditions: The default action of sig is termination.
-func dieFromSignal(sig linux.Signal) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
- sa := sigaction{handler: linux.SIG_DFL}
- if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, linux.SignalSetSize, 0, 0); e != 0 {
- panic(fmt.Sprintf("rt_sigaction failed: %v", e))
- }
-
- set := linux.MakeSignalSet(sig)
- if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGPROCMASK, linux.SIG_UNBLOCK, uintptr(unsafe.Pointer(&set)), 0, linux.SignalSetSize, 0, 0); e != 0 {
- panic(fmt.Sprintf("rt_sigprocmask failed: %v", e))
- }
-
- if err := syscall.Tgkill(syscall.Getpid(), syscall.Gettid(), syscall.Signal(sig)); err != nil {
- panic(fmt.Sprintf("tgkill failed: %v", err))
- }
-
- panic("failed to die")
-}
diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD
index 4a6e83a8b..357517ed4 100644
--- a/pkg/sentry/socket/control/BUILD
+++ b/pkg/sentry/socket/control/BUILD
@@ -17,6 +17,7 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/socket",
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usermem",
"//pkg/syserror",
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index 4e95101b7..1684dfc24 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
@@ -194,15 +195,15 @@ func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([
// the available space, we must align down.
//
// align must be >= 4 and each data int32 is 4 bytes. The length of the
- // header is already aligned, so if we align to the with of the data there
+ // header is already aligned, so if we align to the width of the data there
// are two cases:
// 1. The aligned length is less than the length of the header. The
// unaligned length was also less than the length of the header, so we
// can't write anything.
// 2. The aligned length is greater than or equal to the length of the
- // header. We can write the header plus zero or more datas. We can't write
- // a partial int32, so the length of the message will be
- // min(aligned length, header + datas).
+ // header. We can write the header plus zero or more bytes of data. We can't
+ // write a partial int32, so the length of the message will be
+ // min(aligned length, header + data).
if space < linux.SizeOfControlMessageHeader {
flags |= linux.MSG_CTRUNC
return buf, flags
@@ -239,12 +240,12 @@ func putCmsgStruct(buf []byte, msgLevel, msgType uint32, align uint, data interf
buf = binary.Marshal(buf, usermem.ByteOrder, data)
- // Check if we went over.
+ // If the control message data brought us over capacity, omit it.
if cap(buf) != cap(ob) {
return hdrBuf
}
- // Fix up length.
+ // Update control message length to include data.
putUint64(ob, uint64(len(buf)-len(ob)))
return alignSlice(buf, align)
@@ -320,35 +321,109 @@ func PackInq(t *kernel.Task, inq int32, buf []byte) []byte {
buf,
linux.SOL_TCP,
linux.TCP_INQ,
- 4,
+ t.Arch().Width(),
inq,
)
}
+// PackTOS packs an IP_TOS socket control message.
+func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte {
+ return putCmsgStruct(
+ buf,
+ linux.SOL_IP,
+ linux.IP_TOS,
+ t.Arch().Width(),
+ tos,
+ )
+}
+
+// PackTClass packs an IPV6_TCLASS socket control message.
+func PackTClass(t *kernel.Task, tClass int32, buf []byte) []byte {
+ return putCmsgStruct(
+ buf,
+ linux.SOL_IPV6,
+ linux.IPV6_TCLASS,
+ t.Arch().Width(),
+ tClass,
+ )
+}
+
+// PackControlMessages packs control messages into the given buffer.
+//
+// We skip control messages specific to Unix domain sockets.
+//
+// Note that some control messages may be truncated if they do not fit under
+// the capacity of buf.
+func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byte) []byte {
+ if cmsgs.IP.HasTimestamp {
+ buf = PackTimestamp(t, cmsgs.IP.Timestamp, buf)
+ }
+
+ if cmsgs.IP.HasInq {
+ // In Linux, TCP_CM_INQ is added after SO_TIMESTAMP.
+ buf = PackInq(t, cmsgs.IP.Inq, buf)
+ }
+
+ if cmsgs.IP.HasTOS {
+ buf = PackTOS(t, cmsgs.IP.TOS, buf)
+ }
+
+ if cmsgs.IP.HasTClass {
+ buf = PackTClass(t, cmsgs.IP.TClass, buf)
+ }
+
+ return buf
+}
+
+// cmsgSpace is equivalent to CMSG_SPACE in Linux.
+func cmsgSpace(t *kernel.Task, dataLen int) int {
+ return linux.SizeOfControlMessageHeader + AlignUp(dataLen, t.Arch().Width())
+}
+
+// CmsgsSpace returns the number of bytes needed to fit the control messages
+// represented in cmsgs.
+func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int {
+ space := 0
+
+ if cmsgs.IP.HasTimestamp {
+ space += cmsgSpace(t, linux.SizeOfTimeval)
+ }
+
+ if cmsgs.IP.HasInq {
+ space += cmsgSpace(t, linux.SizeOfControlMessageInq)
+ }
+
+ if cmsgs.IP.HasTOS {
+ space += cmsgSpace(t, linux.SizeOfControlMessageTOS)
+ }
+
+ if cmsgs.IP.HasTClass {
+ space += cmsgSpace(t, linux.SizeOfControlMessageTClass)
+ }
+
+ return space
+}
+
// Parse parses a raw socket control message into portable objects.
-func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.ControlMessages, error) {
+func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.ControlMessages, error) {
var (
- fds linux.ControlMessageRights
- haveCreds bool
- creds linux.ControlMessageCredentials
+ cmsgs socket.ControlMessages
+ fds linux.ControlMessageRights
)
for i := 0; i < len(buf); {
if i+linux.SizeOfControlMessageHeader > len(buf) {
- return transport.ControlMessages{}, syserror.EINVAL
+ return cmsgs, syserror.EINVAL
}
var h linux.ControlMessageHeader
binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageHeader], usermem.ByteOrder, &h)
if h.Length < uint64(linux.SizeOfControlMessageHeader) {
- return transport.ControlMessages{}, syserror.EINVAL
+ return socket.ControlMessages{}, syserror.EINVAL
}
if h.Length > uint64(len(buf)-i) {
- return transport.ControlMessages{}, syserror.EINVAL
- }
- if h.Level != linux.SOL_SOCKET {
- return transport.ControlMessages{}, syserror.EINVAL
+ return socket.ControlMessages{}, syserror.EINVAL
}
i += linux.SizeOfControlMessageHeader
@@ -358,59 +433,85 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (transport.
// sizeof(long) in CMSG_ALIGN.
width := t.Arch().Width()
- switch h.Type {
- case linux.SCM_RIGHTS:
- rightsSize := AlignDown(length, linux.SizeOfControlMessageRight)
- numRights := rightsSize / linux.SizeOfControlMessageRight
-
- if len(fds)+numRights > linux.SCM_MAX_FD {
- return transport.ControlMessages{}, syserror.EINVAL
+ switch h.Level {
+ case linux.SOL_SOCKET:
+ switch h.Type {
+ case linux.SCM_RIGHTS:
+ rightsSize := AlignDown(length, linux.SizeOfControlMessageRight)
+ numRights := rightsSize / linux.SizeOfControlMessageRight
+
+ if len(fds)+numRights > linux.SCM_MAX_FD {
+ return socket.ControlMessages{}, syserror.EINVAL
+ }
+
+ for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight {
+ fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight])))
+ }
+
+ i += AlignUp(length, width)
+
+ case linux.SCM_CREDENTIALS:
+ if length < linux.SizeOfControlMessageCredentials {
+ return socket.ControlMessages{}, syserror.EINVAL
+ }
+
+ var creds linux.ControlMessageCredentials
+ binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], usermem.ByteOrder, &creds)
+ scmCreds, err := NewSCMCredentials(t, creds)
+ if err != nil {
+ return socket.ControlMessages{}, err
+ }
+ cmsgs.Unix.Credentials = scmCreds
+ i += AlignUp(length, width)
+
+ default:
+ // Unknown message type.
+ return socket.ControlMessages{}, syserror.EINVAL
}
-
- for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight {
- fds = append(fds, int32(usermem.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight])))
+ case linux.SOL_IP:
+ switch h.Type {
+ case linux.IP_TOS:
+ if length < linux.SizeOfControlMessageTOS {
+ return socket.ControlMessages{}, syserror.EINVAL
+ }
+ cmsgs.IP.HasTOS = true
+ binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTOS], usermem.ByteOrder, &cmsgs.IP.TOS)
+ i += AlignUp(length, width)
+
+ default:
+ return socket.ControlMessages{}, syserror.EINVAL
}
-
- i += AlignUp(length, width)
-
- case linux.SCM_CREDENTIALS:
- if length < linux.SizeOfControlMessageCredentials {
- return transport.ControlMessages{}, syserror.EINVAL
+ case linux.SOL_IPV6:
+ switch h.Type {
+ case linux.IPV6_TCLASS:
+ if length < linux.SizeOfControlMessageTClass {
+ return socket.ControlMessages{}, syserror.EINVAL
+ }
+ cmsgs.IP.HasTClass = true
+ binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageTClass], usermem.ByteOrder, &cmsgs.IP.TClass)
+ i += AlignUp(length, width)
+
+ default:
+ return socket.ControlMessages{}, syserror.EINVAL
}
-
- binary.Unmarshal(buf[i:i+linux.SizeOfControlMessageCredentials], usermem.ByteOrder, &creds)
- haveCreds = true
- i += AlignUp(length, width)
-
default:
- // Unknown message type.
- return transport.ControlMessages{}, syserror.EINVAL
+ return socket.ControlMessages{}, syserror.EINVAL
}
}
- var credentials SCMCredentials
- if haveCreds {
- var err error
- if credentials, err = NewSCMCredentials(t, creds); err != nil {
- return transport.ControlMessages{}, err
- }
- } else {
- credentials = makeCreds(t, socketOrEndpoint)
+ if cmsgs.Unix.Credentials == nil {
+ cmsgs.Unix.Credentials = makeCreds(t, socketOrEndpoint)
}
- var rights SCMRights
if len(fds) > 0 {
- var err error
- if rights, err = NewSCMRights(t, fds); err != nil {
- return transport.ControlMessages{}, err
+ rights, err := NewSCMRights(t, fds)
+ if err != nil {
+ return socket.ControlMessages{}, err
}
+ cmsgs.Unix.Rights = rights
}
- if credentials == nil && rights == nil {
- return transport.ControlMessages{}, nil
- }
-
- return transport.ControlMessages{Credentials: credentials, Rights: rights}, nil
+ return cmsgs, nil
}
func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials {
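
CmsgsSpace and PackControlMessages are meant to be used together: size the buffer with CmsgsSpace (CMSG_SPACE-style rounding per message), then let PackControlMessages append the encoded headers and payloads, which is exactly what the hostinet SendMsg path below does. A minimal sketch of that pattern, assuming an existing task t and an already-populated cmsgs value:

    package example

    import (
        "gvisor.dev/gvisor/pkg/sentry/kernel"
        "gvisor.dev/gvisor/pkg/sentry/socket"
        "gvisor.dev/gvisor/pkg/sentry/socket/control"
    )

    // packCmsgs sketches the intended calling pattern: reserve exactly the
    // space the messages need, then append the encoded messages into it.
    func packCmsgs(t *kernel.Task, cmsgs socket.ControlMessages) []byte {
        buf := make([]byte, 0, control.CmsgsSpace(t, cmsgs))
        return control.PackControlMessages(t, cmsgs, buf)
    }
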
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index 8b66a719d..4c44c7c0f 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -29,10 +29,12 @@ go_library(
"//pkg/sentry/kernel/time",
"//pkg/sentry/safemem",
"//pkg/sentry/socket",
+ "//pkg/sentry/socket/control",
"//pkg/sentry/usermem",
"//pkg/syserr",
"//pkg/syserror",
"//pkg/tcpip/stack",
"//pkg/waiter",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 92beb1bcf..c957b0f1d 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -18,6 +18,7 @@ import (
"fmt"
"syscall"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/fdnotifier"
@@ -29,6 +30,7 @@ import (
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/socket"
+ "gvisor.dev/gvisor/pkg/sentry/socket/control"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
@@ -41,6 +43,10 @@ const (
// sizeofSockaddr is the size in bytes of the largest sockaddr type
// supported by this package.
sizeofSockaddr = syscall.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in)
+
+ // maxControlLen is the maximum size of a control message buffer used in a
+ // recvmsg or sendmsg syscall.
+ maxControlLen = 1024
)
// socketOperations implements fs.FileOperations and socket.Socket for a socket
@@ -281,26 +287,32 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt
// Whitelist options and constrain option length.
var optlen int
switch level {
- case syscall.SOL_IPV6:
+ case linux.SOL_IP:
+ switch name {
+ case linux.IP_TOS, linux.IP_RECVTOS:
+ optlen = sizeofInt32
+ }
+ case linux.SOL_IPV6:
switch name {
- case syscall.IPV6_V6ONLY:
+ case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY:
optlen = sizeofInt32
}
- case syscall.SOL_SOCKET:
+ case linux.SOL_SOCKET:
switch name {
- case syscall.SO_ERROR, syscall.SO_KEEPALIVE, syscall.SO_SNDBUF, syscall.SO_RCVBUF, syscall.SO_REUSEADDR:
+ case linux.SO_ERROR, linux.SO_KEEPALIVE, linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR:
optlen = sizeofInt32
- case syscall.SO_LINGER:
+ case linux.SO_LINGER:
optlen = syscall.SizeofLinger
}
- case syscall.SOL_TCP:
+ case linux.SOL_TCP:
switch name {
- case syscall.TCP_NODELAY:
+ case linux.TCP_NODELAY:
optlen = sizeofInt32
- case syscall.TCP_INFO:
+ case linux.TCP_INFO:
optlen = int(linux.SizeOfTCPInfo)
}
}
+
if optlen == 0 {
return nil, syserr.ErrProtocolNotAvailable // ENOPROTOOPT
}
@@ -320,19 +332,24 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
// Whitelist options and constrain option length.
var optlen int
switch level {
- case syscall.SOL_IPV6:
+ case linux.SOL_IP:
+ switch name {
+ case linux.IP_TOS, linux.IP_RECVTOS:
+ optlen = sizeofInt32
+ }
+ case linux.SOL_IPV6:
switch name {
- case syscall.IPV6_V6ONLY:
+ case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_V6ONLY:
optlen = sizeofInt32
}
- case syscall.SOL_SOCKET:
+ case linux.SOL_SOCKET:
switch name {
- case syscall.SO_SNDBUF, syscall.SO_RCVBUF, syscall.SO_REUSEADDR:
+ case linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR:
optlen = sizeofInt32
}
- case syscall.SOL_TCP:
+ case linux.SOL_TCP:
switch name {
- case syscall.TCP_NODELAY:
+ case linux.TCP_NODELAY:
optlen = sizeofInt32
}
}
@@ -354,11 +371,11 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
}
// RecvMsg implements socket.Socket.RecvMsg.
-func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
+func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
// Whitelist flags.
//
// FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary
- // messages that netstack/tcpip/transport/unix doesn't understand. Kill the
+ // messages that gvisor/pkg/tcpip/transport/unix doesn't understand. Kill the
// Socket interface's dependence on netstack.
if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_PEEK|syscall.MSG_TRUNC) != 0 {
return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument
@@ -370,6 +387,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
senderAddrBuf = make([]byte, sizeofSockaddr)
}
+ var controlBuf []byte
var msgFlags int
recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
@@ -384,11 +402,6 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
// We always do a non-blocking recv*().
sysflags := flags | syscall.MSG_DONTWAIT
- if dsts.NumBlocks() == 1 {
- // Skip allocating []syscall.Iovec.
- return recvfrom(s.fd, dsts.Head().ToSlice(), sysflags, &senderAddrBuf)
- }
-
iovs := iovecsFromBlockSeq(dsts)
msg := syscall.Msghdr{
Iov: &iovs[0],
@@ -398,12 +411,21 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
msg.Name = &senderAddrBuf[0]
msg.Namelen = uint32(len(senderAddrBuf))
}
+ if controlLen > 0 {
+ if controlLen > maxControlLen {
+ controlLen = maxControlLen
+ }
+ controlBuf = make([]byte, controlLen)
+ msg.Control = &controlBuf[0]
+ msg.Controllen = controlLen
+ }
n, err := recvmsg(s.fd, &msg, sysflags)
if err != nil {
return 0, err
}
senderAddrBuf = senderAddrBuf[:msg.Namelen]
msgFlags = int(msg.Flags)
+ controlLen = uint64(msg.Controllen)
return n, nil
})
@@ -429,14 +451,38 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
n, err = dst.CopyOutFrom(t, recvmsgToBlocks)
}
}
-
- // We don't allow control messages.
- msgFlags &^= linux.MSG_CTRUNC
+ if err != nil {
+ return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
+ }
if senderRequested {
senderAddr = socket.UnmarshalSockAddr(s.family, senderAddrBuf)
}
- return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), socket.ControlMessages{}, syserr.FromError(err)
+
+ unixControlMessages, err := unix.ParseSocketControlMessage(controlBuf[:controlLen])
+ if err != nil {
+ return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
+ }
+
+ controlMessages := socket.ControlMessages{}
+ for _, unixCmsg := range unixControlMessages {
+ switch unixCmsg.Header.Level {
+ case syscall.SOL_IP:
+ switch unixCmsg.Header.Type {
+ case syscall.IP_TOS:
+ controlMessages.IP.HasTOS = true
+ binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTOS], usermem.ByteOrder, &controlMessages.IP.TOS)
+ }
+ case syscall.SOL_IPV6:
+ switch unixCmsg.Header.Type {
+ case syscall.IPV6_TCLASS:
+ controlMessages.IP.HasTClass = true
+ binary.Unmarshal(unixCmsg.Data[:linux.SizeOfControlMessageTClass], usermem.ByteOrder, &controlMessages.IP.TClass)
+ }
+ }
+ }
+
+ return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), controlMessages, nil
}
// SendMsg implements socket.Socket.SendMsg.
@@ -446,6 +492,14 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
return 0, syserr.ErrInvalidArgument
}
+ space := uint64(control.CmsgsSpace(t, controlMessages))
+ if space > maxControlLen {
+ space = maxControlLen
+ }
+ controlBuf := make([]byte, 0, space)
+ // PackControlMessages will append up to space bytes to controlBuf.
+ controlBuf = control.PackControlMessages(t, controlMessages, controlBuf)
+
sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) {
// Refuse to do anything if any part of src.Addrs was unusable.
if uint64(src.NumBytes()) != srcs.NumBytes() {
@@ -458,7 +512,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
// We always do a non-blocking send*().
sysflags := flags | syscall.MSG_DONTWAIT
- if srcs.NumBlocks() == 1 {
+ if srcs.NumBlocks() == 1 && len(controlBuf) == 0 {
// Skip allocating []syscall.Iovec.
src := srcs.Head()
n, _, errno := syscall.Syscall6(syscall.SYS_SENDTO, uintptr(s.fd), src.Addr(), uintptr(src.Len()), uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to)))
@@ -477,6 +531,10 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
msg.Name = &to[0]
msg.Namelen = uint32(len(to))
}
+ if len(controlBuf) != 0 {
+ msg.Control = &controlBuf[0]
+ msg.Controllen = uint64(len(controlBuf))
+ }
return sendmsg(s.fd, &msg, sysflags)
})
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
index 5eb06bbf4..b70047d81 100644
--- a/pkg/sentry/socket/netfilter/BUILD
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -14,6 +14,7 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/binary",
+ "//pkg/log",
"//pkg/sentry/kernel",
"//pkg/sentry/usermem",
"//pkg/syserr",
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 9f87c32f1..a9cfc1749 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -21,6 +21,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserr"
@@ -35,6 +36,7 @@ const errorTargetName = "ERROR"
// metadata is opaque to netstack. It holds data that we need to translate
// between Linux's and netstack's iptables representations.
+// TODO(gvisor.dev/issue/170): This might be removable.
type metadata struct {
HookEntry [linux.NF_INET_NUMHOOKS]uint32
Underflow [linux.NF_INET_NUMHOOKS]uint32
@@ -51,7 +53,7 @@ func GetInfo(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr) (linux.IPTG
}
// Find the appropriate table.
- table, err := findTable(ep, info.TableName())
+ table, err := findTable(ep, info.Name)
if err != nil {
return linux.IPTGetinfo{}, err
}
@@ -82,30 +84,31 @@ func GetEntries(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr, outLen i
}
// Find the appropriate table.
- table, err := findTable(ep, userEntries.TableName())
+ table, err := findTable(ep, userEntries.Name)
if err != nil {
return linux.KernelIPTGetEntries{}, err
}
// Convert netstack's iptables rules to something that the iptables
// tool can understand.
- entries, _, err := convertNetstackToBinary(userEntries.TableName(), table)
+ entries, _, err := convertNetstackToBinary(userEntries.Name.String(), table)
if err != nil {
return linux.KernelIPTGetEntries{}, err
}
if binary.Size(entries) > uintptr(outLen) {
+ log.Warningf("Insufficient GetEntries output size: %d", uintptr(outLen))
return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
}
return entries, nil
}
-func findTable(ep tcpip.Endpoint, tableName string) (iptables.Table, *syserr.Error) {
+func findTable(ep tcpip.Endpoint, tablename linux.TableName) (iptables.Table, *syserr.Error) {
ipt, err := ep.IPTables()
if err != nil {
return iptables.Table{}, syserr.FromError(err)
}
- table, ok := ipt.Tables[tableName]
+ table, ok := ipt.Tables[tablename.String()]
if !ok {
return iptables.Table{}, syserr.ErrInvalidArgument
}
@@ -135,110 +138,68 @@ func FillDefaultIPTables(stack *stack.Stack) {
// format expected by the iptables tool. Linux stores each table as a binary
// blob that can only be traversed by parsing a bit, reading some offsets,
// jumping to those offsets, parsing again, etc.
-func convertNetstackToBinary(name string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) {
+func convertNetstackToBinary(tablename string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) {
// Return values.
var entries linux.KernelIPTGetEntries
var meta metadata
// The table name has to fit in the struct.
- if linux.XT_TABLE_MAXNAMELEN < len(name) {
+ if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
+ log.Warningf("Table name %q too long.", tablename)
return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
}
- copy(entries.Name[:], name)
-
- // Deal with the built in chains first (INPUT, OUTPUT, etc.). Each of
- // these chains ends with an unconditional policy entry.
- for hook := iptables.Prerouting; hook < iptables.NumHooks; hook++ {
- chain, ok := table.BuiltinChains[hook]
- if !ok {
- // This table doesn't support this hook.
- continue
- }
-
- // Sanity check.
- if len(chain.Rules) < 1 {
- return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
- }
+ copy(entries.Name[:], tablename)
- for ruleIdx, rule := range chain.Rules {
- // If this is the first rule of a builtin chain, set
- // the metadata hook entry point.
- if ruleIdx == 0 {
+ for ruleIdx, rule := range table.Rules {
+ // Is this a chain entry point?
+ for hook, hookRuleIdx := range table.BuiltinChains {
+ if hookRuleIdx == ruleIdx {
meta.HookEntry[hook] = entries.Size
}
-
- // Each rule corresponds to an entry.
- entry := linux.KernelIPTEntry{
- IPTEntry: linux.IPTEntry{
- NextOffset: linux.SizeOfIPTEntry,
- TargetOffset: linux.SizeOfIPTEntry,
- },
+ }
+ // Is this a chain underflow point?
+ for underflow, underflowRuleIdx := range table.Underflows {
+ if underflowRuleIdx == ruleIdx {
+ meta.Underflow[underflow] = entries.Size
}
+ }
- for _, matcher := range rule.Matchers {
- // Serialize the matcher and add it to the
- // entry.
- serialized := marshalMatcher(matcher)
- entry.Elems = append(entry.Elems, serialized...)
- entry.NextOffset += uint16(len(serialized))
- entry.TargetOffset += uint16(len(serialized))
- }
+ // Each rule corresponds to an entry.
+ entry := linux.KernelIPTEntry{
+ IPTEntry: linux.IPTEntry{
+ NextOffset: linux.SizeOfIPTEntry,
+ TargetOffset: linux.SizeOfIPTEntry,
+ },
+ }
- // Serialize and append the target.
- serialized := marshalTarget(rule.Target)
+ for _, matcher := range rule.Matchers {
+ // Serialize the matcher and add it to the
+ // entry.
+ serialized := marshalMatcher(matcher)
entry.Elems = append(entry.Elems, serialized...)
entry.NextOffset += uint16(len(serialized))
-
- // The underflow rule is the last rule in the chain,
- // and is an unconditional rule (i.e. it matches any
- // packet). This is enforced when saving iptables.
- if ruleIdx == len(chain.Rules)-1 {
- meta.Underflow[hook] = entries.Size
- }
-
- entries.Size += uint32(entry.NextOffset)
- entries.Entrytable = append(entries.Entrytable, entry)
- meta.NumEntries++
+ entry.TargetOffset += uint16(len(serialized))
}
- }
-
- // TODO(gvisor.dev/issue/170): Deal with the user chains here. Each of
- // these starts with an error node holding the chain's name and ends
- // with an unconditional return.
+ // Serialize and append the target.
+ serialized := marshalTarget(rule.Target)
+ entry.Elems = append(entry.Elems, serialized...)
+ entry.NextOffset += uint16(len(serialized))
- // Lastly, each table ends with an unconditional error target rule as
- // its final entry.
- errorEntry := linux.KernelIPTEntry{
- IPTEntry: linux.IPTEntry{
- NextOffset: linux.SizeOfIPTEntry,
- TargetOffset: linux.SizeOfIPTEntry,
- },
+ entries.Size += uint32(entry.NextOffset)
+ entries.Entrytable = append(entries.Entrytable, entry)
+ meta.NumEntries++
}
- var errorTarget linux.XTErrorTarget
- errorTarget.Target.TargetSize = linux.SizeOfXTErrorTarget
- copy(errorTarget.ErrorName[:], errorTargetName)
- copy(errorTarget.Target.Name[:], errorTargetName)
-
- // Serialize and add it to the list of entries.
- errorTargetBuf := make([]byte, 0, linux.SizeOfXTErrorTarget)
- serializedErrorTarget := binary.Marshal(errorTargetBuf, usermem.ByteOrder, errorTarget)
- errorEntry.Elems = append(errorEntry.Elems, serializedErrorTarget...)
- errorEntry.NextOffset += uint16(len(serializedErrorTarget))
-
- entries.Size += uint32(errorEntry.NextOffset)
- entries.Entrytable = append(entries.Entrytable, errorEntry)
- meta.NumEntries++
- meta.Size = entries.Size
+ meta.Size = entries.Size
return entries, meta, nil
}
func marshalMatcher(matcher iptables.Matcher) []byte {
switch matcher.(type) {
default:
- // TODO(gvisor.dev/issue/170): We don't support any matchers yet, so
- // any call to marshalMatcher will panic.
+ // TODO(gvisor.dev/issue/170): We don't support any matchers
+ // yet, so any call to marshalMatcher will panic.
panic(fmt.Errorf("unknown matcher of type %T", matcher))
}
}
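As the rewritten convertNetstackToBinary above shows, hook entry and underflow points are recorded as byte offsets into the serialized entry table, accumulated one rule at a time in entries.Size. A small sketch of that accumulation; the rule sizes are made up for illustration:

package main

import "fmt"

// ruleOffsets returns the byte offset of each rule in the serialized table,
// given each rule's total serialized size (entry header + matchers + target).
func ruleOffsets(sizes []uint32) []uint32 {
	offsets := make([]uint32, len(sizes))
	var total uint32
	for i, sz := range sizes {
		offsets[i] = total
		total += sz
	}
	return offsets
}

func main() {
	// Three rules of 112, 152, and 112 bytes start at offsets 0, 112, 264.
	fmt.Println(ruleOffsets([]uint32{112, 152, 112})) // [0 112 264]
}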
@@ -246,28 +207,46 @@ func marshalMatcher(matcher iptables.Matcher) []byte {
func marshalTarget(target iptables.Target) []byte {
switch target.(type) {
case iptables.UnconditionalAcceptTarget:
- return marshalUnconditionalAcceptTarget()
+ return marshalStandardTarget(iptables.Accept)
+ case iptables.UnconditionalDropTarget:
+ return marshalStandardTarget(iptables.Drop)
+ case iptables.ErrorTarget:
+ return marshalErrorTarget()
default:
panic(fmt.Errorf("unknown target of type %T", target))
}
}
-func marshalUnconditionalAcceptTarget() []byte {
+func marshalStandardTarget(verdict iptables.Verdict) []byte {
// The target's name will be the empty string.
target := linux.XTStandardTarget{
Target: linux.XTEntryTarget{
TargetSize: linux.SizeOfXTStandardTarget,
},
- Verdict: translateStandardVerdict(iptables.Accept),
+ Verdict: translateFromStandardVerdict(verdict),
}
ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
return binary.Marshal(ret, usermem.ByteOrder, target)
}
-// translateStandardVerdict translates verdicts the same way as the iptables
+func marshalErrorTarget() []byte {
+ // This is an error target named "error".
+ target := linux.XTErrorTarget{
+ Target: linux.XTEntryTarget{
+ TargetSize: linux.SizeOfXTErrorTarget,
+ },
+ }
+ copy(target.Name[:], errorTargetName)
+ copy(target.Target.Name[:], errorTargetName)
+
+ ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
+ return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
+// translateFromStandardVerdict translates verdicts the same way as the iptables
// tool.
-func translateStandardVerdict(verdict iptables.Verdict) int32 {
+func translateFromStandardVerdict(verdict iptables.Verdict) int32 {
switch verdict {
case iptables.Accept:
return -linux.NF_ACCEPT - 1
@@ -280,7 +259,258 @@ func translateStandardVerdict(verdict iptables.Verdict) int32 {
case iptables.Jump:
// TODO(gvisor.dev/issue/170): Support Jump.
panic("Jump isn't supported yet")
+ }
+ panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
+}
+
+// translateToStandardVerdict translates from the value in a
+// linux.XTStandardTarget to an iptables.Verdict.
+func translateToStandardVerdict(val int32) (iptables.Verdict, *syserr.Error) {
+ // TODO(gvisor.dev/issue/170): Support other verdicts.
+ switch val {
+ case -linux.NF_ACCEPT - 1:
+ return iptables.Accept, nil
+ case -linux.NF_DROP - 1:
+ return iptables.Drop, nil
+ case -linux.NF_QUEUE - 1:
+ log.Warningf("Unsupported iptables verdict QUEUE.")
+ case linux.NF_RETURN:
+ log.Warningf("Unsupported iptables verdict RETURN.")
+ default:
+ log.Warningf("Unknown iptables verdict %d.", val)
+ }
+ return iptables.Invalid, syserr.ErrInvalidArgument
+}
+
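translateFromStandardVerdict and translateToStandardVerdict rely on the encoding the iptables tool uses for builtin verdicts: the xt_standard_target verdict field stores -(NF_* value) - 1, so builtin verdicts never collide with non-negative jump offsets. A tiny round-trip sketch, using the Linux values NF_DROP = 0 and NF_ACCEPT = 1:

package main

import "fmt"

const (
	nfDrop   int32 = 0 // NF_DROP
	nfAccept int32 = 1 // NF_ACCEPT
)

// toWire encodes an NF_* verdict the way xt_standard_target stores it.
func toWire(nf int32) int32 { return -nf - 1 }

// fromWire decodes the stored value back to the NF_* verdict.
func fromWire(v int32) int32 { return -v - 1 }

func main() {
	fmt.Println(toWire(nfAccept), toWire(nfDrop))   // -2 -1
	fmt.Println(fromWire(toWire(nfDrop)) == nfDrop) // true
}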
+// SetEntries sets iptables rules for a single table. See
+// net/ipv4/netfilter/ip_tables.c:translate_table for reference.
+func SetEntries(stack *stack.Stack, optVal []byte) *syserr.Error {
+ printReplace(optVal)
+
+ // Get the basic rules data (struct ipt_replace).
+ if len(optVal) < linux.SizeOfIPTReplace {
+ log.Warningf("netfilter.SetEntries: optVal has insufficient size for replace %d", len(optVal))
+ return syserr.ErrInvalidArgument
+ }
+ var replace linux.IPTReplace
+ replaceBuf := optVal[:linux.SizeOfIPTReplace]
+ optVal = optVal[linux.SizeOfIPTReplace:]
+ binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace)
+
+ // TODO(gvisor.dev/issue/170): Support other tables.
+ var table iptables.Table
+ switch replace.Name.String() {
+ case iptables.TablenameFilter:
+ table = iptables.EmptyFilterTable()
default:
- panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
+ log.Warningf("We don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
+ return syserr.ErrInvalidArgument
+ }
+
+ // Convert input into a list of rules and their offsets.
+ var offset uint32
+ var offsets []uint32
+ for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+ // Get the struct ipt_entry.
+ if len(optVal) < linux.SizeOfIPTEntry {
+ log.Warningf("netfilter: optVal has insufficient size for entry %d", len(optVal))
+ return syserr.ErrInvalidArgument
+ }
+ var entry linux.IPTEntry
+ buf := optVal[:linux.SizeOfIPTEntry]
+ optVal = optVal[linux.SizeOfIPTEntry:]
+ binary.Unmarshal(buf, usermem.ByteOrder, &entry)
+ if entry.TargetOffset != linux.SizeOfIPTEntry {
+ // TODO(gvisor.dev/issue/170): Support matchers.
+ return syserr.ErrInvalidArgument
+ }
+
+ // TODO(gvisor.dev/issue/170): We should support IPTIP
+ // filtering. We reject any nonzero IPTIP values for now.
+ emptyIPTIP := linux.IPTIP{}
+ if entry.IP != emptyIPTIP {
+ log.Warningf("netfilter: non-empty struct iptip found")
+ return syserr.ErrInvalidArgument
+ }
+
+ // Get the target of the rule.
+ target, consumed, err := parseTarget(optVal)
+ if err != nil {
+ return err
+ }
+ optVal = optVal[consumed:]
+
+ table.Rules = append(table.Rules, iptables.Rule{Target: target})
+ offsets = append(offsets, offset)
+ offset += linux.SizeOfIPTEntry + consumed
+ }
+
+ // Go through the list of supported hooks for this table and, for each
+ // one, set the rule it corresponds to.
+ for hook := range replace.HookEntry {
+ if table.ValidHooks()&uint32(hook) != 0 {
+ hk := hookFromLinux(hook)
+ for ruleIdx, offset := range offsets {
+ if offset == replace.HookEntry[hook] {
+ table.BuiltinChains[hk] = ruleIdx
+ }
+ if offset == replace.Underflow[hook] {
+ table.Underflows[hk] = ruleIdx
+ }
+ }
+ if ruleIdx := table.BuiltinChains[hk]; ruleIdx == iptables.HookUnset {
+ log.Warningf("Hook %v is unset.", hk)
+ return syserr.ErrInvalidArgument
+ }
+ if ruleIdx := table.Underflows[hk]; ruleIdx == iptables.HookUnset {
+ log.Warningf("Underflow %v is unset.", hk)
+ return syserr.ErrInvalidArgument
+ }
+ }
+ }
+
+ ipt := stack.IPTables()
+ table.SetMetadata(metadata{
+ HookEntry: replace.HookEntry,
+ Underflow: replace.Underflow,
+ NumEntries: replace.NumEntries,
+ Size: replace.Size,
+ })
+ ipt.Tables[replace.Name.String()] = table
+ stack.SetIPTables(ipt)
+
+ return nil
+}
+
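SetEntries consumes optVal the same way the kernel's translate_table does: peel off a fixed-size struct ipt_replace, then repeatedly read a struct ipt_entry and advance by the offsets it declares. A self-contained sketch of that walking pattern; entryHeader is a stand-in for the real pkg/abi/linux layout and the byte values are fabricated:

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// entryHeader is a stand-in for a fixed-size header such as struct ipt_entry.
type entryHeader struct {
	TargetOffset uint16
	NextOffset   uint16
}

// walk peels headers off buf, advancing by each entry's declared NextOffset.
func walk(buf []byte) error {
	for len(buf) > 0 {
		var h entryHeader
		if err := binary.Read(bytes.NewReader(buf), binary.LittleEndian, &h); err != nil {
			return err
		}
		fmt.Printf("entry: target at %d, next at %d\n", h.TargetOffset, h.NextOffset)
		if h.NextOffset == 0 || int(h.NextOffset) > len(buf) {
			return fmt.Errorf("bad NextOffset %d", h.NextOffset)
		}
		buf = buf[h.NextOffset:]
	}
	return nil
}

func main() {
	// Two fabricated 8-byte entries, each declaring TargetOffset=4, NextOffset=8.
	b := []byte{4, 0, 8, 0, 0, 0, 0, 0, 4, 0, 8, 0, 0, 0, 0, 0}
	fmt.Println(walk(b)) // <nil>
}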
+// parseTarget parses a target from the start of optVal and returns the target
+// along with the number of bytes it occupies in optVal.
+func parseTarget(optVal []byte) (iptables.Target, uint32, *syserr.Error) {
+ if len(optVal) < linux.SizeOfXTEntryTarget {
+ log.Warningf("netfilter: optVal has insufficient size for entry target %d", len(optVal))
+ return nil, 0, syserr.ErrInvalidArgument
+ }
+ var target linux.XTEntryTarget
+ buf := optVal[:linux.SizeOfXTEntryTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &target)
+ switch target.Name.String() {
+ case "":
+ // Standard target.
+ if len(optVal) < linux.SizeOfXTStandardTarget {
+ log.Warningf("netfilter.SetEntries: optVal has insufficient size for standard target %d", len(optVal))
+ return nil, 0, syserr.ErrInvalidArgument
+ }
+ var standardTarget linux.XTStandardTarget
+ buf = optVal[:linux.SizeOfXTStandardTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &standardTarget)
+
+ verdict, err := translateToStandardVerdict(standardTarget.Verdict)
+ if err != nil {
+ return nil, 0, err
+ }
+ switch verdict {
+ case iptables.Accept:
+ return iptables.UnconditionalAcceptTarget{}, linux.SizeOfXTStandardTarget, nil
+ case iptables.Drop:
+ // TODO(gvisor.dev/issue/170): Return an
+ // iptables.UnconditionalDropTarget to support DROP.
+ log.Infof("netfilter DROP is not supported yet.")
+ return nil, 0, syserr.ErrInvalidArgument
+ default:
+ panic(fmt.Sprintf("Unknown verdict: %v", verdict))
+ }
+
+ case errorTargetName:
+ // Error target.
+ if len(optVal) < linux.SizeOfXTErrorTarget {
+ log.Infof("netfilter.SetEntries: optVal has insufficient size for error target %d", len(optVal))
+ return nil, 0, syserr.ErrInvalidArgument
+ }
+ var errorTarget linux.XTErrorTarget
+ buf = optVal[:linux.SizeOfXTErrorTarget]
+ binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget)
+
+ // Error targets are used in two cases:
+ // * An actual error case. These rules have an error named
+ //   errorTargetName. The last entry of the table is usually an
+ //   error case to catch any packets that somehow fall through
+ //   every rule.
+ // * To mark the start of a user-defined chain. These rules have
+ //   an error target whose name is the name of the chain.
+ switch errorTarget.Name.String() {
+ case errorTargetName:
+ return iptables.ErrorTarget{}, linux.SizeOfXTErrorTarget, nil
+ default:
+ log.Infof("Unknown error target %q doesn't exist or isn't supported yet.", errorTarget.Name.String())
+ return nil, 0, syserr.ErrInvalidArgument
+ }
+ }
+
+ // Unknown target.
+ log.Infof("Unknown target %q doesn't exist or isn't supported yet.", target.Name.String())
+ return nil, 0, syserr.ErrInvalidArgument
+}
+
+func hookFromLinux(hook int) iptables.Hook {
+ switch hook {
+ case linux.NF_INET_PRE_ROUTING:
+ return iptables.Prerouting
+ case linux.NF_INET_LOCAL_IN:
+ return iptables.Input
+ case linux.NF_INET_FORWARD:
+ return iptables.Forward
+ case linux.NF_INET_LOCAL_OUT:
+ return iptables.Output
+ case linux.NF_INET_POST_ROUTING:
+ return iptables.Postrouting
+ }
+ panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook))
+}
+
+// printReplace prints information about the struct ipt_replace in optVal. It
+// is only for debugging.
+func printReplace(optVal []byte) {
+ // Basic replace info.
+ var replace linux.IPTReplace
+ replaceBuf := optVal[:linux.SizeOfIPTReplace]
+ optVal = optVal[linux.SizeOfIPTReplace:]
+ binary.Unmarshal(replaceBuf, usermem.ByteOrder, &replace)
+ log.Infof("Replacing table %q: %+v", replace.Name.String(), replace)
+
+ // Read in the list of entries at the end of replace.
+ var totalOffset uint16
+ for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
+ var entry linux.IPTEntry
+ entryBuf := optVal[:linux.SizeOfIPTEntry]
+ binary.Unmarshal(entryBuf, usermem.ByteOrder, &entry)
+ log.Infof("Entry %d (total offset %d): %+v", entryIdx, totalOffset, entry)
+
+ totalOffset += entry.NextOffset
+ if entry.TargetOffset == linux.SizeOfIPTEntry {
+ log.Infof("Entry has no matches.")
+ } else {
+ log.Infof("Entry has matches.")
+ }
+
+ var target linux.XTEntryTarget
+ targetBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTEntryTarget]
+ binary.Unmarshal(targetBuf, usermem.ByteOrder, &target)
+ log.Infof("Target named %q: %+v", target.Name.String(), target)
+
+ switch target.Name.String() {
+ case "":
+ var standardTarget linux.XTStandardTarget
+ stBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTStandardTarget]
+ binary.Unmarshal(stBuf, usermem.ByteOrder, &standardTarget)
+ log.Infof("Standard target with verdict %q (%d).", linux.VerdictStrings[standardTarget.Verdict], standardTarget.Verdict)
+ case errorTargetName:
+ var errorTarget linux.XTErrorTarget
+ etBuf := optVal[entry.TargetOffset : entry.TargetOffset+linux.SizeOfXTErrorTarget]
+ binary.Unmarshal(etBuf, usermem.ByteOrder, &errorTarget)
+ log.Infof("Error target with name %q.", errorTarget.Name.String())
+ default:
+ log.Infof("Unknown target type.")
+ }
+
+ optVal = optVal[entry.NextOffset:]
}
}
diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD
index 79589e3c8..103933144 100644
--- a/pkg/sentry/socket/netlink/BUILD
+++ b/pkg/sentry/socket/netlink/BUILD
@@ -22,12 +22,12 @@ go_library(
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/time",
- "//pkg/sentry/safemem",
"//pkg/sentry/socket",
"//pkg/sentry/socket/netlink/port",
"//pkg/sentry/socket/unix",
"//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserr",
"//pkg/syserror",
"//pkg/tcpip",
diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD
index 463544c1a..2d9f4ba9b 100644
--- a/pkg/sentry/socket/netlink/port/BUILD
+++ b/pkg/sentry/socket/netlink/port/BUILD
@@ -8,6 +8,7 @@ go_library(
srcs = ["port.go"],
importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port",
visibility = ["//pkg/sentry:internal"],
+ deps = ["//pkg/sync"],
)
go_test(
diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go
index e9d3275b1..2cd3afc22 100644
--- a/pkg/sentry/socket/netlink/port/port.go
+++ b/pkg/sentry/socket/netlink/port/port.go
@@ -24,7 +24,8 @@ import (
"fmt"
"math"
"math/rand"
- "sync"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
// maxPorts is a sanity limit on the maximum number of ports to allocate per
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index 4a1b87a9a..cea56f4ed 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -17,7 +17,6 @@ package netlink
import (
"math"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/binary"
@@ -29,12 +28,12 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
- "gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
"gvisor.dev/gvisor/pkg/sentry/socket/unix"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/tcpip"
@@ -500,29 +499,29 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have
trunc := flags&linux.MSG_TRUNC != 0
r := unix.EndpointReader{
+ Ctx: t,
Endpoint: s.ep,
Peek: flags&linux.MSG_PEEK != 0,
}
+ doRead := func() (int64, error) {
+ return dst.CopyOutFrom(t, &r)
+ }
+
// If MSG_TRUNC is set with a zero byte destination then we still need
// to read the message and discard it, or in the case where MSG_PEEK is
// set, leave it be. In both cases the full message length must be
- // returned. However, the memory manager for the destination will not read
- // the endpoint if the destination is zero length.
- //
- // In order for the endpoint to be read when the destination size is zero,
- // we must cause a read of the endpoint by using a separate fake zero
- // length block sequence and calling the EndpointReader directly.
+ // returned.
if trunc && dst.Addrs.NumBytes() == 0 {
- // Perform a read to a zero byte block sequence. We can ignore the
- // original destination since it was zero bytes. The length returned by
- // ReadToBlocks is ignored and we return the full message length to comply
- // with MSG_TRUNC.
- _, err := r.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(make([]byte, 0))))
- return int(r.MsgSize), linux.MSG_TRUNC, from, fromLen, socket.ControlMessages{}, syserr.FromError(err)
+ doRead = func() (int64, error) {
+ err := r.Truncate()
+ // Always return zero for bytes read since the destination size is
+ // zero.
+ return 0, err
+ }
}
- if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
+ if n, err := doRead(); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
var mflags int
if n < int64(r.MsgSize) {
mflags |= linux.MSG_TRUNC
@@ -540,7 +539,7 @@ func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, have
defer s.EventUnregister(&e)
for {
- if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock {
+ if n, err := doRead(); err != syserror.ErrWouldBlock {
var mflags int
if n < int64(r.MsgSize) {
mflags |= linux.MSG_TRUNC
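Both the netlink and unix RecvMsg paths now swap in a different read function when MSG_TRUNC arrives with a zero-length destination: the message is still consumed via Truncate and its full size is reported through r.MsgSize, but zero bytes are counted as copied. A generic, stripped-down sketch of that closure swap; the flag names and sizes are illustrative only:

package main

import "fmt"

func main() {
	const msgSize = 128 // pretend queued message length
	trunc, dstBytes := true, 0

	// Normal path: copy the message out and report the bytes copied.
	doRead := func() (int64, error) { return msgSize, nil }
	if trunc && dstBytes == 0 {
		// MSG_TRUNC with no buffer: consume and discard, report zero copied.
		doRead = func() (int64, error) { return 0, nil }
	}

	n, _ := doRead()
	// The caller still returns the full message size to honor MSG_TRUNC.
	fmt.Println(n, msgSize) // 0 128
}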
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index e414d8055..f78784569 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -34,6 +34,7 @@ go_library(
"//pkg/sentry/socket/netfilter",
"//pkg/sentry/unimpl",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserr",
"//pkg/syserror",
"//pkg/tcpip",
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index d92399efd..d2f7e987d 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -29,7 +29,6 @@ import (
"io"
"math"
"reflect"
- "sync"
"syscall"
"time"
@@ -49,6 +48,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/tcpip"
@@ -151,6 +151,8 @@ var Metrics = tcpip.Stats{
PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."),
CurrentEstablished: mustCreateMetric("/netstack/tcp/current_established", "Number of connections in either ESTABLISHED or CLOSE-WAIT state now."),
EstablishedResets: mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"),
+ EstablishedClosed: mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."),
+ EstablishedTimedout: mustCreateMetric("/netstack/tcp/established_timedout", "Number of times an established connection was reset because of keep-alive time out."),
ListenOverflowSynDrop: mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."),
ListenOverflowAckDrop: mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."),
ListenOverflowSynCookieSent: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."),
@@ -220,17 +222,25 @@ type commonEndpoint interface {
// transport.Endpoint.SetSockOpt.
SetSockOpt(interface{}) *tcpip.Error
+ // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool and
+ // transport.Endpoint.SetSockOptBool.
+ SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error
+
// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and
// transport.Endpoint.SetSockOptInt.
- SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error
+ SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
// GetSockOpt implements tcpip.Endpoint.GetSockOpt and
// transport.Endpoint.GetSockOpt.
GetSockOpt(interface{}) *tcpip.Error
+ // GetSockOptBool implements tcpip.Endpoint.GetSockOptBool and
+ // transport.Endpoint.GetSockOptBool.
+ GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error)
+
// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and
// transport.Endpoint.GetSockOpt.
- GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error)
+ GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
}
// SocketOperations encapsulates all the state needed to represent a network stack
@@ -314,22 +324,15 @@ func bytesToIPAddress(addr []byte) tcpip.Address {
// converts it to the FullAddress format. It supports AF_UNIX, AF_INET,
// AF_INET6, and AF_PACKET addresses.
//
-// strict indicates whether addresses with the AF_UNSPEC family are accepted of not.
-//
// AddressAndFamily returns an address and its family.
-func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, uint16, *syserr.Error) {
+func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) {
// Make sure we have at least 2 bytes for the address family.
if len(addr) < 2 {
return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument
}
- family := usermem.ByteOrder.Uint16(addr)
- if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) {
- return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported
- }
-
// Get the rest of the fields based on the address family.
- switch family {
+ switch family := usermem.ByteOrder.Uint16(addr); family {
case linux.AF_UNIX:
path := addr[2:]
if len(path) > linux.UnixPathMax {
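AddressAndFamily now always reads the family from the first two bytes of the sockaddr and leaves family validation to its callers (checkFamily). A standalone sketch of that first step, assuming a little-endian host so native byte order matches binary.LittleEndian; the sample bytes are fabricated:

package main

import (
	"encoding/binary"
	"fmt"
)

// family reads sa_family from the first two bytes of a raw Linux sockaddr.
func family(sockaddr []byte) (uint16, bool) {
	if len(sockaddr) < 2 {
		return 0, false
	}
	return binary.LittleEndian.Uint16(sockaddr), true
}

func main() {
	// Prefix of a struct sockaddr_in: sa_family = AF_INET (2 on Linux).
	raw := []byte{2, 0, 0, 80, 127, 0, 0, 1}
	f, ok := family(raw)
	fmt.Println(f, ok) // 2 true
}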
@@ -628,10 +631,40 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
return r
}
+func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error {
+ if family == uint16(s.family) {
+ return nil
+ }
+ if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 {
+ v, err := s.Endpoint.GetSockOptBool(tcpip.V6OnlyOption)
+ if err != nil {
+ return syserr.TranslateNetstackError(err)
+ }
+ if !v {
+ return nil
+ }
+ }
+ return syserr.ErrInvalidArgument
+}
+
+// mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the
+// receiver's family is AF_INET6.
+//
+// This is a hack to work around the fact that both IPv4 and IPv6 ANY are
+// represented by the empty string.
+//
+// TODO(gvisor.dev/issues/1556): remove this function.
+func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress {
+ if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET {
+ addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00"
+ }
+ return addr
+}
+
// Connect implements the linux syscall connect(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
- addr, family, err := AddressAndFamily(s.family, sockaddr, false /* strict */)
+ addr, family, err := AddressAndFamily(sockaddr)
if err != nil {
return err
}
@@ -643,6 +676,12 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
}
return syserr.TranslateNetstackError(err)
}
+
+ if err := s.checkFamily(family, false /* exact */); err != nil {
+ return err
+ }
+ addr = s.mapFamily(addr, family)
+
// Always return right away in the non-blocking case.
if !blocking {
return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
@@ -671,10 +710,14 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
// Bind implements the linux syscall bind(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
- addr, _, err := AddressAndFamily(s.family, sockaddr, true /* strict */)
+ addr, family, err := AddressAndFamily(sockaddr)
if err != nil {
return err
}
+ if err := s.checkFamily(family, true /* exact */); err != nil {
+ return err
+ }
+ addr = s.mapFamily(addr, family)
// Issue the bind request to the endpoint.
return syserr.TranslateNetstackError(s.Endpoint.Bind(addr))
@@ -975,13 +1018,23 @@ func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family
if err := ep.GetSockOpt(&v); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
- if len(v) == 0 {
+ if v == 0 {
return []byte{}, nil
}
if outLen < linux.IFNAMSIZ {
return nil, syserr.ErrInvalidArgument
}
- return append([]byte(v), 0), nil
+ s := t.NetworkContext()
+ if s == nil {
+ return nil, syserr.ErrNoDevice
+ }
+ nic, ok := s.Interfaces()[int32(v)]
+ if !ok {
+ // The NICID no longer indicates a valid interface, probably because that
+ // interface was removed.
+ return nil, syserr.ErrUnknownDevice
+ }
+ return append([]byte(nic.Name), 0), nil
case linux.SO_BROADCAST:
if outLen < sizeOfInt32 {
@@ -1125,6 +1178,18 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
return int32(time.Duration(v) / time.Second), nil
+ case linux.TCP_USER_TIMEOUT:
+ if outLen < sizeOfInt32 {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ var v tcpip.TCPUserTimeoutOption
+ if err := ep.GetSockOpt(&v); err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+
+ return int32(time.Duration(v) / time.Millisecond), nil
+
case linux.TCP_INFO:
var v tcpip.TCPInfoOption
if err := ep.GetSockOpt(&v); err != nil {
@@ -1199,12 +1264,15 @@ func getSockOptIPv6(t *kernel.Task, ep commonEndpoint, name, outLen int) (interf
return nil, syserr.ErrInvalidArgument
}
- var v tcpip.V6OnlyOption
- if err := ep.GetSockOpt(&v); err != nil {
+ v, err := ep.GetSockOptBool(tcpip.V6OnlyOption)
+ if err != nil {
return nil, syserr.TranslateNetstackError(err)
}
-
- return int32(v), nil
+ var o int32
+ if v {
+ o = 1
+ }
+ return o, nil
case linux.IPV6_PATHMTU:
t.Kernel().EmitUnimplementedEvent(t)
@@ -1309,6 +1377,21 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int, family in
}
return int32(v), nil
+ case linux.IP_RECVTOS:
+ if outLen < sizeOfInt32 {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ v, err := ep.GetSockOptBool(tcpip.ReceiveTOSOption)
+ if err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+ var o int32
+ if v {
+ o = 1
+ }
+ return o, nil
+
default:
emitUnimplementedEventIP(t, name)
}
@@ -1342,6 +1425,26 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa
return nil
}
+ if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
+ switch name {
+ case linux.IPT_SO_SET_REPLACE:
+ if len(optVal) < linux.SizeOfIPTReplace {
+ return syserr.ErrInvalidArgument
+ }
+
+ stack := inet.StackFromContext(t)
+ if stack == nil {
+ return syserr.ErrNoDevice
+ }
+ // Stack must be a netstack stack.
+ return netfilter.SetEntries(stack.(*Stack).Stack, optVal)
+
+ case linux.IPT_SO_SET_ADD_COUNTERS:
+ // TODO(gvisor.dev/issue/170): Counter support.
+ return nil
+ }
+ }
+
return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
}
@@ -1413,7 +1516,20 @@ func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
if n == -1 {
n = len(optVal)
}
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(optVal[:n])))
+ name := string(optVal[:n])
+ if name == "" {
+ return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(0)))
+ }
+ s := t.NetworkContext()
+ if s == nil {
+ return syserr.ErrNoDevice
+ }
+ for nicID, nic := range s.Interfaces() {
+ if nic.Name == name {
+ return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.BindToDeviceOption(nicID)))
+ }
+ }
+ return syserr.ErrUnknownDevice
case linux.SO_BROADCAST:
if len(optVal) < sizeOfInt32 {
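With BindToDeviceOption now carrying a NIC ID rather than a name, the set path above resolves the device name to an interface ID, and the get path earlier in this file resolves the ID back to a name. Outside the sentry the analogous lookup can be done with the standard library; a small sketch, with "lo" as an assumed interface name:

package main

import (
	"fmt"
	"net"
)

// indexByName resolves an interface name to its index, mirroring the
// name-to-NIC-ID lookup done when handling SO_BINDTODEVICE.
func indexByName(name string) (int, bool) {
	ifaces, err := net.Interfaces()
	if err != nil {
		return 0, false
	}
	for _, ifc := range ifaces {
		if ifc.Name == name {
			return ifc.Index, true
		}
	}
	return 0, false
}

func main() {
	idx, ok := indexByName("lo")
	fmt.Println(idx, ok)
}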
@@ -1561,6 +1677,17 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
+ case linux.TCP_USER_TIMEOUT:
+ if len(optVal) < sizeOfInt32 {
+ return syserr.ErrInvalidArgument
+ }
+
+ v := int32(usermem.ByteOrder.Uint32(optVal))
+ if v < 0 {
+ return syserr.ErrInvalidArgument
+ }
+ return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v))))
+
case linux.TCP_CONGESTION:
v := tcpip.CongestionControlOption(optVal)
if err := ep.SetSockOpt(v); err != nil {
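TCP_USER_TIMEOUT is set and reported in milliseconds, which is why the hunks above convert through time.Millisecond in both directions. A sketch of the equivalent calls against a host socket on Linux using golang.org/x/sys/unix; the 30000 ms value is arbitrary:

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	fd, err := unix.Socket(unix.AF_INET, unix.SOCK_STREAM, 0)
	if err != nil {
		fmt.Println(err)
		return
	}
	defer unix.Close(fd)

	// Set a 30 second user timeout; the option value is in milliseconds.
	if err := unix.SetsockoptInt(fd, unix.IPPROTO_TCP, unix.TCP_USER_TIMEOUT, 30000); err != nil {
		fmt.Println(err)
		return
	}
	v, err := unix.GetsockoptInt(fd, unix.IPPROTO_TCP, unix.TCP_USER_TIMEOUT)
	fmt.Println(v, err) // 30000 <nil>
}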
@@ -1596,7 +1723,7 @@ func setSockOptIPv6(t *kernel.Task, ep commonEndpoint, name int, optVal []byte)
}
v := usermem.ByteOrder.Uint32(optVal)
- return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.V6OnlyOption(v)))
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.V6OnlyOption, v != 0))
case linux.IPV6_ADD_MEMBERSHIP,
linux.IPV6_DROP_MEMBERSHIP,
@@ -1783,6 +1910,13 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.IPv4TOSOption(v)))
+ case linux.IP_RECVTOS:
+ v, err := parseIntOrChar(optVal)
+ if err != nil {
+ return err
+ }
+ return syserr.TranslateNetstackError(ep.SetSockOptBool(tcpip.ReceiveTOSOption, v != 0))
+
case linux.IP_ADD_SOURCE_MEMBERSHIP,
linux.IP_BIND_ADDRESS_NO_PORT,
linux.IP_BLOCK_SOURCE,
@@ -1803,7 +1937,6 @@ func setSockOptIP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *s
linux.IP_RECVFRAGSIZE,
linux.IP_RECVOPTS,
linux.IP_RECVORIGDSTADDR,
- linux.IP_RECVTOS,
linux.IP_RECVTTL,
linux.IP_RETOPTS,
linux.IP_TRANSPARENT,
@@ -2001,8 +2134,8 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32)
case linux.AF_INET6:
var out linux.SockAddrInet6
- if len(addr.Addr) == 4 {
- // Copy address is v4-mapped format.
+ if len(addr.Addr) == header.IPv4AddressSize {
+ // Copy address in v4-mapped format.
copy(out.Addr[12:], addr.Addr)
out.Addr[10] = 0xff
out.Addr[11] = 0xff
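The corrected comment and code above place a 4-byte IPv4 address into the ::ffff:a.b.c.d (v4-mapped) form of an AF_INET6 sockaddr. A standalone sketch of the same byte layout, using an address from the documentation range:

package main

import (
	"fmt"
	"net"
)

// v4Mapped embeds an IPv4 address into a 16-byte v4-mapped IPv6 address:
// bytes 10 and 11 are 0xff and the last four bytes are the IPv4 address.
func v4Mapped(v4 [4]byte) [16]byte {
	var out [16]byte
	out[10], out[11] = 0xff, 0xff
	copy(out[12:], v4[:])
	return out
}

func main() {
	m := v4Mapped([4]byte{192, 0, 2, 1})
	fmt.Println(net.IP(m[:])) // 192.0.2.1
}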
@@ -2223,7 +2356,14 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
}
func (s *SocketOperations) controlMessages() socket.ControlMessages {
- return socket.ControlMessages{IP: tcpip.ControlMessages{HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, Timestamp: s.readCM.Timestamp}}
+ return socket.ControlMessages{
+ IP: tcpip.ControlMessages{
+ HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp,
+ Timestamp: s.readCM.Timestamp,
+ HasTOS: s.readCM.HasTOS,
+ TOS: s.readCM.TOS,
+ },
+ }
}
// updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after
@@ -2316,10 +2456,14 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
var addr *tcpip.FullAddress
if len(to) > 0 {
- addrBuf, _, err := AddressAndFamily(s.family, to, true /* strict */)
+ addrBuf, family, err := AddressAndFamily(to)
if err != nil {
return 0, err
}
+ if err := s.checkFamily(family, false /* exact */); err != nil {
+ return 0, err
+ }
+ addrBuf = s.mapFamily(addrBuf, family)
addr = &addrBuf
}
diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD
index 23eadcb1b..b2677c659 100644
--- a/pkg/sentry/socket/rpcinet/conn/BUILD
+++ b/pkg/sentry/socket/rpcinet/conn/BUILD
@@ -10,6 +10,7 @@ go_library(
deps = [
"//pkg/binary",
"//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto",
+ "//pkg/sync",
"//pkg/syserr",
"//pkg/unet",
"@com_github_golang_protobuf//proto:go_default_library",
diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go
index 356adad99..02f39c767 100644
--- a/pkg/sentry/socket/rpcinet/conn/conn.go
+++ b/pkg/sentry/socket/rpcinet/conn/conn.go
@@ -17,12 +17,12 @@ package conn
import (
"fmt"
- "sync"
"sync/atomic"
"syscall"
"github.com/golang/protobuf/proto"
"gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/unet"
diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD
index a3585e10d..a5954f22b 100644
--- a/pkg/sentry/socket/rpcinet/notifier/BUILD
+++ b/pkg/sentry/socket/rpcinet/notifier/BUILD
@@ -10,6 +10,7 @@ go_library(
deps = [
"//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto",
"//pkg/sentry/socket/rpcinet/conn",
+ "//pkg/sync",
"//pkg/waiter",
"@org_golang_x_sys//unix:go_default_library",
],
diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go
index 7efe4301f..82b75d6dd 100644
--- a/pkg/sentry/socket/rpcinet/notifier/notifier.go
+++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go
@@ -17,12 +17,12 @@ package notifier
import (
"fmt"
- "sync"
"syscall"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn"
pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/socket/rpcinet/syscall_rpc.proto b/pkg/sentry/socket/rpcinet/syscall_rpc.proto
index 9586f5923..b677e9eb3 100644
--- a/pkg/sentry/socket/rpcinet/syscall_rpc.proto
+++ b/pkg/sentry/socket/rpcinet/syscall_rpc.proto
@@ -3,7 +3,6 @@ syntax = "proto3";
// package syscall_rpc is a set of networking related system calls that can be
// forwarded to a socket gofer.
//
-// TODO(b/77963526): Document individual RPCs.
package syscall_rpc;
message SendmsgRequest {
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 8c250c325..2389a9cdb 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -43,6 +43,11 @@ type ControlMessages struct {
IP tcpip.ControlMessages
}
+// Release releases Unix domain socket credentials and rights.
+func (c *ControlMessages) Release() {
+ c.Unix.Release()
+}
+
// Socket is the interface containing socket syscalls used by the syscall layer
// to redirect them to the appropriate implementation.
type Socket interface {
diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go
index 2ec1a662d..2447f24ef 100644
--- a/pkg/sentry/socket/unix/io.go
+++ b/pkg/sentry/socket/unix/io.go
@@ -83,6 +83,19 @@ type EndpointReader struct {
ControlTrunc bool
}
+// Truncate calls RecvMsg on the endpoint without writing to a destination.
+func (r *EndpointReader) Truncate() error {
+ // Ignore bytes read since it will always be zero.
+ _, ms, c, ct, err := r.Endpoint.RecvMsg(r.Ctx, [][]byte{}, r.Creds, r.NumRights, r.Peek, r.From)
+ r.Control = c
+ r.ControlTrunc = ct
+ r.MsgSize = ms
+ if err != nil {
+ return err.ToError()
+ }
+ return nil
+}
+
// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (r *EndpointReader) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
return safemem.FromVecReaderFunc{func(bufs [][]byte) (int64, error) {
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index 788ad70d2..d7ba95dff 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -32,6 +32,7 @@ go_library(
"//pkg/ilist",
"//pkg/refs",
"//pkg/sentry/context",
+ "//pkg/sync",
"//pkg/syserr",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index dea11e253..9e6fbc111 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -15,10 +15,9 @@
package transport
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/waiter"
diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go
index e27b1c714..5dcd3d95e 100644
--- a/pkg/sentry/socket/unix/transport/queue.go
+++ b/pkg/sentry/socket/unix/transport/queue.go
@@ -15,9 +15,8 @@
package transport
import (
- "sync"
-
"gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/waiter"
)
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 529a7a7a9..fcc0da332 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -16,11 +16,11 @@
package transport
import (
- "sync"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -175,17 +175,25 @@ type Endpoint interface {
// types.
SetSockOpt(opt interface{}) *tcpip.Error
+ // SetSockOptBool sets a socket option for simple cases when a value has
+ // the bool type.
+ SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error
+
// SetSockOptInt sets a socket option for simple cases when a value has
// the int type.
- SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error
+ SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error
// GetSockOpt gets a socket option. opt should be a pointer to one of the
// tcpip.*Option types.
GetSockOpt(opt interface{}) *tcpip.Error
+ // GetSockOptBool gets a socket option for simple cases when a return
+ // value has the bool type.
+ GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error)
+
// GetSockOptInt gets a socket option for simple cases when a return
// value has the int type.
- GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error)
+ GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error)
// State returns the current state of the socket, as represented by Linux in
// procfs.
@@ -851,11 +859,19 @@ func (e *baseEndpoint) SetSockOpt(opt interface{}) *tcpip.Error {
return nil
}
-func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (e *baseEndpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
return nil
}
-func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+ return nil
+}
+
+func (e *baseEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+ return false, tcpip.ErrUnknownProtocolOption
+}
+
+func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
switch opt {
case tcpip.ReceiveQueueSizeOption:
v := 0
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 1aaae8487..7f49ba864 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -116,10 +116,16 @@ func (s *SocketOperations) Endpoint() transport.Endpoint {
// extractPath extracts and validates the address.
func extractPath(sockaddr []byte) (string, *syserr.Error) {
- addr, _, err := netstack.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */)
+ addr, family, err := netstack.AddressAndFamily(sockaddr)
if err != nil {
+ if err == syserr.ErrAddressFamilyNotSupported {
+ err = syserr.ErrInvalidArgument
+ }
return "", err
}
+ if family != linux.AF_UNIX {
+ return "", syserr.ErrInvalidArgument
+ }
// The address is trimmed by GetAddress.
p := string(addr.Addr)
@@ -541,8 +547,27 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
if senderRequested {
r.From = &tcpip.FullAddress{}
}
+
+ doRead := func() (int64, error) {
+ return dst.CopyOutFrom(t, &r)
+ }
+
+ // If MSG_TRUNC is set with a zero byte destination then we still need
+ // to read the message and discard it, or in the case where MSG_PEEK is
+ // set, leave it be. In both cases the full message length must be
+ // returned.
+ if trunc && dst.Addrs.NumBytes() == 0 {
+ doRead = func() (int64, error) {
+ err := r.Truncate()
+ // Always return zero for bytes read since the destination size is
+ // zero.
+ return 0, err
+ }
+
+ }
+
var total int64
- if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || dontWait {
+ if n, err := doRead(); err != syserror.ErrWouldBlock || dontWait {
var from linux.SockAddr
var fromLen uint32
if r.From != nil && len([]byte(r.From.Addr)) != 0 {
@@ -577,7 +602,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
defer s.EventUnregister(&e)
for {
- if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock {
+ if n, err := doRead(); err != syserror.ErrWouldBlock {
var from linux.SockAddr
var fromLen uint32
if r.From != nil {
diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD
index 72ebf766d..aa1ac720c 100644
--- a/pkg/sentry/strace/BUILD
+++ b/pkg/sentry/strace/BUILD
@@ -10,10 +10,12 @@ go_library(
"capability.go",
"clone.go",
"futex.go",
- "linux64.go",
+ "linux64_amd64.go",
+ "linux64_arm64.go",
"open.go",
"poll.go",
"ptrace.go",
+ "select.go",
"signal.go",
"socket.go",
"strace.go",
diff --git a/pkg/sentry/strace/linux64.go b/pkg/sentry/strace/linux64_amd64.go
index 5d57b75af..1e823b685 100644
--- a/pkg/sentry/strace/linux64.go
+++ b/pkg/sentry/strace/linux64_amd64.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,8 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// +build amd64
+
package strace
+import (
+ "gvisor.dev/gvisor/pkg/abi"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
// linuxAMD64 provides a mapping of the Linux amd64 syscalls and their argument
// types for display / formatting.
var linuxAMD64 = SyscallMap{
@@ -40,7 +47,7 @@ var linuxAMD64 = SyscallMap{
20: makeSyscallInfo("writev", FD, WriteIOVec, Hex),
21: makeSyscallInfo("access", Path, Oct),
22: makeSyscallInfo("pipe", PipeFDs),
- 23: makeSyscallInfo("select", Hex, Hex, Hex, Hex, Timeval),
+ 23: makeSyscallInfo("select", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timeval),
24: makeSyscallInfo("sched_yield"),
25: makeSyscallInfo("mremap", Hex, Hex, Hex, Hex, Hex),
26: makeSyscallInfo("msync", Hex, Hex, Hex),
@@ -287,7 +294,7 @@ var linuxAMD64 = SyscallMap{
267: makeSyscallInfo("readlinkat", FD, Path, ReadBuffer, Hex),
268: makeSyscallInfo("fchmodat", FD, Path, Mode),
269: makeSyscallInfo("faccessat", FD, Path, Oct, Hex),
- 270: makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex),
+ 270: makeSyscallInfo("pselect6", Hex, SelectFDSet, SelectFDSet, SelectFDSet, Timespec, SigSet),
271: makeSyscallInfo("ppoll", PollFDs, Hex, Timespec, SigSet, Hex),
272: makeSyscallInfo("unshare", CloneFlags),
273: makeSyscallInfo("set_robust_list", Hex, Hex),
@@ -335,5 +342,43 @@ var linuxAMD64 = SyscallMap{
315: makeSyscallInfo("sched_getattr", Hex, Hex, Hex),
316: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex),
317: makeSyscallInfo("seccomp", Hex, Hex, Hex),
+ 318: makeSyscallInfo("getrandom", Hex, Hex, Hex),
+ 319: makeSyscallInfo("memfd_create", Path, Hex), // Not quite a path, but close.
+ 320: makeSyscallInfo("kexec_file_load", FD, FD, Hex, Hex, Hex),
+ 321: makeSyscallInfo("bpf", Hex, Hex, Hex),
+ 322: makeSyscallInfo("execveat", FD, Path, ExecveStringVector, ExecveStringVector, Hex),
+ 323: makeSyscallInfo("userfaultfd", Hex),
+ 324: makeSyscallInfo("membarrier", Hex, Hex),
+ 325: makeSyscallInfo("mlock2", Hex, Hex, Hex),
+ 326: makeSyscallInfo("copy_file_range", FD, Hex, FD, Hex, Hex, Hex),
+ 327: makeSyscallInfo("preadv2", FD, ReadIOVec, Hex, Hex, Hex),
+ 328: makeSyscallInfo("pwritev2", FD, WriteIOVec, Hex, Hex, Hex),
+ 329: makeSyscallInfo("pkey_mprotect", Hex, Hex, Hex, Hex),
+ 330: makeSyscallInfo("pkey_alloc", Hex, Hex),
+ 331: makeSyscallInfo("pkey_free", Hex),
332: makeSyscallInfo("statx", FD, Path, Hex, Hex, Hex),
+ 333: makeSyscallInfo("io_pgetevents", Hex, Hex, Hex, Hex, Timespec, SigSet),
+ 334: makeSyscallInfo("rseq", Hex, Hex, Hex, Hex),
+ 424: makeSyscallInfo("pidfd_send_signal", FD, Signal, Hex, Hex),
+ 425: makeSyscallInfo("io_uring_setup", Hex, Hex),
+ 426: makeSyscallInfo("io_uring_enter", FD, Hex, Hex, Hex, SigSet, Hex),
+ 427: makeSyscallInfo("io_uring_register", FD, Hex, Hex, Hex),
+ 428: makeSyscallInfo("open_tree", FD, Path, Hex),
+ 429: makeSyscallInfo("move_mount", FD, Path, FD, Path, Hex),
+ 430: makeSyscallInfo("fsopen", Path, Hex), // Not quite a path, but close.
+ 431: makeSyscallInfo("fsconfig", FD, Hex, Hex, Hex, Hex),
+ 432: makeSyscallInfo("fsmount", FD, Hex, Hex),
+ 433: makeSyscallInfo("fspick", FD, Path, Hex),
+ 434: makeSyscallInfo("pidfd_open", Hex, Hex),
+ 435: makeSyscallInfo("clone3", Hex, Hex),
+}
+
+func init() {
+ syscallTables = append(syscallTables,
+ syscallTable{
+ os: abi.Linux,
+ arch: arch.AMD64,
+ syscalls: linuxAMD64,
+ },
+ )
}
diff --git a/pkg/sentry/strace/linux64_arm64.go b/pkg/sentry/strace/linux64_arm64.go
new file mode 100644
index 000000000..c3ac5248d
--- /dev/null
+++ b/pkg/sentry/strace/linux64_arm64.go
@@ -0,0 +1,323 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package strace
+
+import (
+ "gvisor.dev/gvisor/pkg/abi"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+// linuxARM64 provides a mapping of the Linux arm64 syscalls and their argument
+// types for display / formatting.
+var linuxARM64 = SyscallMap{
+ 0: makeSyscallInfo("io_setup", Hex, Hex),
+ 1: makeSyscallInfo("io_destroy", Hex),
+ 2: makeSyscallInfo("io_submit", Hex, Hex, Hex),
+ 3: makeSyscallInfo("io_cancel", Hex, Hex, Hex),
+ 4: makeSyscallInfo("io_getevents", Hex, Hex, Hex, Hex, Timespec),
+ 5: makeSyscallInfo("setxattr", Path, Path, Hex, Hex, Hex),
+ 6: makeSyscallInfo("lsetxattr", Path, Path, Hex, Hex, Hex),
+ 7: makeSyscallInfo("fsetxattr", FD, Path, Hex, Hex, Hex),
+ 8: makeSyscallInfo("getxattr", Path, Path, Hex, Hex),
+ 9: makeSyscallInfo("lgetxattr", Path, Path, Hex, Hex),
+ 10: makeSyscallInfo("fgetxattr", FD, Path, Hex, Hex),
+ 11: makeSyscallInfo("listxattr", Path, Path, Hex),
+ 12: makeSyscallInfo("llistxattr", Path, Path, Hex),
+ 13: makeSyscallInfo("flistxattr", FD, Path, Hex),
+ 14: makeSyscallInfo("removexattr", Path, Path),
+ 15: makeSyscallInfo("lremovexattr", Path, Path),
+ 16: makeSyscallInfo("fremovexattr", FD, Path),
+ 17: makeSyscallInfo("getcwd", PostPath, Hex),
+ 18: makeSyscallInfo("lookup_dcookie", Hex, Hex, Hex),
+ 19: makeSyscallInfo("eventfd2", Hex, Hex),
+ 20: makeSyscallInfo("epoll_create1", Hex),
+ 21: makeSyscallInfo("epoll_ctl", Hex, Hex, FD, Hex),
+ 22: makeSyscallInfo("epoll_pwait", Hex, Hex, Hex, Hex, SigSet, Hex),
+ 23: makeSyscallInfo("dup", FD),
+ 24: makeSyscallInfo("dup3", FD, FD, Hex),
+ 25: makeSyscallInfo("fcntl", FD, Hex, Hex),
+ 26: makeSyscallInfo("inotify_init1", Hex),
+ 27: makeSyscallInfo("inotify_add_watch", Hex, Path, Hex),
+ 28: makeSyscallInfo("inotify_rm_watch", Hex, Hex),
+ 29: makeSyscallInfo("ioctl", FD, Hex, Hex),
+ 30: makeSyscallInfo("ioprio_set", Hex, Hex, Hex),
+ 31: makeSyscallInfo("ioprio_get", Hex, Hex),
+ 32: makeSyscallInfo("flock", FD, Hex),
+ 33: makeSyscallInfo("mknodat", FD, Path, Mode, Hex),
+ 34: makeSyscallInfo("mkdirat", FD, Path, Hex),
+ 35: makeSyscallInfo("unlinkat", FD, Path, Hex),
+ 36: makeSyscallInfo("symlinkat", Path, Hex, Path),
+ 37: makeSyscallInfo("linkat", FD, Path, Hex, Path, Hex),
+ 38: makeSyscallInfo("renameat", FD, Path, Hex, Path),
+ 39: makeSyscallInfo("umount2", Path, Hex),
+ 40: makeSyscallInfo("mount", Path, Path, Path, Hex, Path),
+ 41: makeSyscallInfo("pivot_root", Path, Path),
+ 42: makeSyscallInfo("nfsservctl", Hex, Hex, Hex),
+ 43: makeSyscallInfo("statfs", Path, Hex),
+ 44: makeSyscallInfo("fstatfs", FD, Hex),
+ 45: makeSyscallInfo("truncate", Path, Hex),
+ 46: makeSyscallInfo("ftruncate", FD, Hex),
+ 47: makeSyscallInfo("fallocate", FD, Hex, Hex, Hex),
+ 48: makeSyscallInfo("faccessat", FD, Path, Oct, Hex),
+ 49: makeSyscallInfo("chdir", Path),
+ 50: makeSyscallInfo("fchdir", FD),
+ 51: makeSyscallInfo("chroot", Path),
+ 52: makeSyscallInfo("fchmod", FD, Mode),
+ 53: makeSyscallInfo("fchmodat", FD, Path, Mode),
+ 54: makeSyscallInfo("fchownat", FD, Path, Hex, Hex, Hex),
+ 55: makeSyscallInfo("fchown", FD, Hex, Hex),
+ 56: makeSyscallInfo("openat", FD, Path, OpenFlags, Mode),
+ 57: makeSyscallInfo("close", FD),
+ 58: makeSyscallInfo("vhangup"),
+ 59: makeSyscallInfo("pipe2", PipeFDs, Hex),
+ 60: makeSyscallInfo("quotactl", Hex, Hex, Hex, Hex),
+ 61: makeSyscallInfo("getdents64", FD, Hex, Hex),
+ 62: makeSyscallInfo("lseek", Hex, Hex, Hex),
+ 63: makeSyscallInfo("read", FD, ReadBuffer, Hex),
+ 64: makeSyscallInfo("write", FD, WriteBuffer, Hex),
+ 65: makeSyscallInfo("readv", FD, ReadIOVec, Hex),
+ 66: makeSyscallInfo("writev", FD, WriteIOVec, Hex),
+ 67: makeSyscallInfo("pread64", FD, ReadBuffer, Hex, Hex),
+ 68: makeSyscallInfo("pwrite64", FD, WriteBuffer, Hex, Hex),
+ 69: makeSyscallInfo("preadv", FD, ReadIOVec, Hex, Hex),
+ 70: makeSyscallInfo("pwritev", FD, WriteIOVec, Hex, Hex),
+ 71: makeSyscallInfo("sendfile", FD, FD, Hex, Hex),
+ 72: makeSyscallInfo("pselect6", Hex, Hex, Hex, Hex, Hex, Hex),
+ 73: makeSyscallInfo("ppoll", PollFDs, Hex, Timespec, SigSet, Hex),
+ 74: makeSyscallInfo("signalfd4", Hex, Hex, Hex, Hex),
+ 75: makeSyscallInfo("vmsplice", FD, Hex, Hex, Hex),
+ 76: makeSyscallInfo("splice", FD, Hex, FD, Hex, Hex, Hex),
+ 77: makeSyscallInfo("tee", FD, FD, Hex, Hex),
+ 78: makeSyscallInfo("readlinkat", FD, Path, ReadBuffer, Hex),
+ 79: makeSyscallInfo("fstatat", FD, Path, Stat, Hex),
+ 80: makeSyscallInfo("fstat", FD, Stat),
+ 81: makeSyscallInfo("sync"),
+ 82: makeSyscallInfo("fsync", FD),
+ 83: makeSyscallInfo("fdatasync", FD),
+ 84: makeSyscallInfo("sync_file_range", FD, Hex, Hex, Hex),
+ 85: makeSyscallInfo("timerfd_create", Hex, Hex),
+ 86: makeSyscallInfo("timerfd_settime", FD, Hex, ItimerSpec, PostItimerSpec),
+ 87: makeSyscallInfo("timerfd_gettime", FD, PostItimerSpec),
+ 88: makeSyscallInfo("utimensat", FD, Path, UTimeTimespec, Hex),
+ 89: makeSyscallInfo("acct", Hex),
+ 90: makeSyscallInfo("capget", CapHeader, PostCapData),
+ 91: makeSyscallInfo("capset", CapHeader, CapData),
+ 92: makeSyscallInfo("personality", Hex),
+ 93: makeSyscallInfo("exit", Hex),
+ 94: makeSyscallInfo("exit_group", Hex),
+ 95: makeSyscallInfo("waitid", Hex, Hex, Hex, Hex, Rusage),
+ 96: makeSyscallInfo("set_tid_address", Hex),
+ 97: makeSyscallInfo("unshare", CloneFlags),
+ 98: makeSyscallInfo("futex", Hex, FutexOp, Hex, Timespec, Hex, Hex),
+ 99: makeSyscallInfo("set_robust_list", Hex, Hex),
+ 100: makeSyscallInfo("get_robust_list", Hex, Hex, Hex),
+ 101: makeSyscallInfo("nanosleep", Timespec, PostTimespec),
+ 102: makeSyscallInfo("getitimer", ItimerType, PostItimerVal),
+ 103: makeSyscallInfo("setitimer", ItimerType, ItimerVal, PostItimerVal),
+ 104: makeSyscallInfo("kexec_load", Hex, Hex, Hex, Hex),
+ 105: makeSyscallInfo("init_module", Hex, Hex, Hex),
+ 106: makeSyscallInfo("delete_module", Hex, Hex),
+ 107: makeSyscallInfo("timer_create", Hex, Hex, Hex),
+ 108: makeSyscallInfo("timer_gettime", Hex, PostItimerSpec),
+ 109: makeSyscallInfo("timer_getoverrun", Hex),
+ 110: makeSyscallInfo("timer_settime", Hex, Hex, ItimerSpec, PostItimerSpec),
+ 111: makeSyscallInfo("timer_delete", Hex),
+ 112: makeSyscallInfo("clock_settime", Hex, Timespec),
+ 113: makeSyscallInfo("clock_gettime", Hex, PostTimespec),
+ 114: makeSyscallInfo("clock_getres", Hex, PostTimespec),
+ 115: makeSyscallInfo("clock_nanosleep", Hex, Hex, Timespec, PostTimespec),
+ 116: makeSyscallInfo("syslog", Hex, Hex, Hex),
+ 117: makeSyscallInfo("ptrace", PtraceRequest, Hex, Hex, Hex),
+ 118: makeSyscallInfo("sched_setparam", Hex, Hex),
+ 119: makeSyscallInfo("sched_setscheduler", Hex, Hex, Hex),
+ 120: makeSyscallInfo("sched_getscheduler", Hex),
+ 121: makeSyscallInfo("sched_getparam", Hex, Hex),
+ 122: makeSyscallInfo("sched_setaffinity", Hex, Hex, Hex),
+ 123: makeSyscallInfo("sched_getaffinity", Hex, Hex, Hex),
+ 124: makeSyscallInfo("sched_yield"),
+ 125: makeSyscallInfo("sched_get_priority_max", Hex),
+ 126: makeSyscallInfo("sched_get_priority_min", Hex),
+ 127: makeSyscallInfo("sched_rr_get_interval", Hex, Hex),
+ 128: makeSyscallInfo("restart_syscall"),
+ 129: makeSyscallInfo("kill", Hex, Signal),
+ 130: makeSyscallInfo("tkill", Hex, Signal),
+ 131: makeSyscallInfo("tgkill", Hex, Hex, Signal),
+ 132: makeSyscallInfo("sigaltstack", Hex, Hex),
+ 133: makeSyscallInfo("rt_sigsuspend", Hex),
+ 134: makeSyscallInfo("rt_sigaction", Signal, SigAction, PostSigAction),
+ 135: makeSyscallInfo("rt_sigprocmask", SignalMaskAction, SigSet, PostSigSet, Hex),
+ 136: makeSyscallInfo("rt_sigpending", Hex),
+ 137: makeSyscallInfo("rt_sigtimedwait", SigSet, Hex, Timespec, Hex),
+ 138: makeSyscallInfo("rt_sigqueueinfo", Hex, Signal, Hex),
+ 139: makeSyscallInfo("rt_sigreturn"),
+ 140: makeSyscallInfo("setpriority", Hex, Hex, Hex),
+ 141: makeSyscallInfo("getpriority", Hex, Hex),
+ 142: makeSyscallInfo("reboot", Hex, Hex, Hex, Hex),
+ 143: makeSyscallInfo("setregid", Hex, Hex),
+ 144: makeSyscallInfo("setgid", Hex),
+ 145: makeSyscallInfo("setreuid", Hex, Hex),
+ 146: makeSyscallInfo("setuid", Hex),
+ 147: makeSyscallInfo("setresuid", Hex, Hex, Hex),
+ 148: makeSyscallInfo("getresuid", Hex, Hex, Hex),
+ 149: makeSyscallInfo("setresgid", Hex, Hex, Hex),
+ 150: makeSyscallInfo("getresgid", Hex, Hex, Hex),
+ 151: makeSyscallInfo("setfsuid", Hex),
+ 152: makeSyscallInfo("setfsgid", Hex),
+ 153: makeSyscallInfo("times", Hex),
+ 154: makeSyscallInfo("setpgid", Hex, Hex),
+ 155: makeSyscallInfo("getpgid", Hex),
+ 156: makeSyscallInfo("getsid", Hex),
+ 157: makeSyscallInfo("setsid"),
+ 158: makeSyscallInfo("getgroups", Hex, Hex),
+ 159: makeSyscallInfo("setgroups", Hex, Hex),
+ 160: makeSyscallInfo("uname", Uname),
+ 161: makeSyscallInfo("sethostname", Hex, Hex),
+ 162: makeSyscallInfo("setdomainname", Hex, Hex),
+ 163: makeSyscallInfo("getrlimit", Hex, Hex),
+ 164: makeSyscallInfo("setrlimit", Hex, Hex),
+ 165: makeSyscallInfo("getrusage", Hex, Rusage),
+ 166: makeSyscallInfo("umask", Hex),
+ 167: makeSyscallInfo("prctl", Hex, Hex, Hex, Hex, Hex),
+ 168: makeSyscallInfo("getcpu", Hex, Hex, Hex),
+ 169: makeSyscallInfo("gettimeofday", Timeval, Hex),
+ 170: makeSyscallInfo("settimeofday", Timeval, Hex),
+ 171: makeSyscallInfo("adjtimex", Hex),
+ 172: makeSyscallInfo("getpid"),
+ 173: makeSyscallInfo("getppid"),
+ 174: makeSyscallInfo("getuid"),
+ 175: makeSyscallInfo("geteuid"),
+ 176: makeSyscallInfo("getgid"),
+ 177: makeSyscallInfo("getegid"),
+ 178: makeSyscallInfo("gettid"),
+ 179: makeSyscallInfo("sysinfo", Hex),
+ 180: makeSyscallInfo("mq_open", Hex, Hex, Hex, Hex),
+ 181: makeSyscallInfo("mq_unlink", Hex),
+ 182: makeSyscallInfo("mq_timedsend", Hex, Hex, Hex, Hex, Hex),
+ 183: makeSyscallInfo("mq_timedreceive", Hex, Hex, Hex, Hex, Hex),
+ 184: makeSyscallInfo("mq_notify", Hex, Hex),
+ 185: makeSyscallInfo("mq_getsetattr", Hex, Hex, Hex),
+ 186: makeSyscallInfo("msgget", Hex, Hex),
+ 187: makeSyscallInfo("msgctl", Hex, Hex, Hex),
+ 188: makeSyscallInfo("msgrcv", Hex, Hex, Hex, Hex, Hex),
+ 189: makeSyscallInfo("msgsnd", Hex, Hex, Hex, Hex),
+ 190: makeSyscallInfo("semget", Hex, Hex, Hex),
+ 191: makeSyscallInfo("semctl", Hex, Hex, Hex, Hex),
+ 192: makeSyscallInfo("semtimedop", Hex, Hex, Hex, Hex),
+ 193: makeSyscallInfo("semop", Hex, Hex, Hex),
+ 194: makeSyscallInfo("shmget", Hex, Hex, Hex),
+ 195: makeSyscallInfo("shmctl", Hex, Hex, Hex),
+ 196: makeSyscallInfo("shmat", Hex, Hex, Hex),
+ 197: makeSyscallInfo("shmdt", Hex),
+ 198: makeSyscallInfo("socket", SockFamily, SockType, SockProtocol),
+ 199: makeSyscallInfo("socketpair", SockFamily, SockType, SockProtocol, Hex),
+ 200: makeSyscallInfo("bind", FD, SockAddr, Hex),
+ 201: makeSyscallInfo("listen", FD, Hex),
+ 202: makeSyscallInfo("accept", FD, PostSockAddr, SockLen),
+ 203: makeSyscallInfo("connect", FD, SockAddr, Hex),
+ 204: makeSyscallInfo("getsockname", FD, PostSockAddr, SockLen),
+ 205: makeSyscallInfo("getpeername", FD, PostSockAddr, SockLen),
+ 206: makeSyscallInfo("sendto", FD, Hex, Hex, Hex, SockAddr, Hex),
+ 207: makeSyscallInfo("recvfrom", FD, Hex, Hex, Hex, PostSockAddr, SockLen),
+ 208: makeSyscallInfo("setsockopt", FD, Hex, Hex, Hex, Hex),
+ 209: makeSyscallInfo("getsockopt", FD, Hex, Hex, Hex, Hex),
+ 210: makeSyscallInfo("shutdown", FD, Hex),
+ 211: makeSyscallInfo("sendmsg", FD, SendMsgHdr, Hex),
+ 212: makeSyscallInfo("recvmsg", FD, RecvMsgHdr, Hex),
+ 213: makeSyscallInfo("readahead", Hex, Hex, Hex),
+ 214: makeSyscallInfo("brk", Hex),
+ 215: makeSyscallInfo("munmap", Hex, Hex),
+ 216: makeSyscallInfo("mremap", Hex, Hex, Hex, Hex, Hex),
+ 217: makeSyscallInfo("add_key", Hex, Hex, Hex, Hex, Hex),
+ 218: makeSyscallInfo("request_key", Hex, Hex, Hex, Hex),
+ 219: makeSyscallInfo("keyctl", Hex, Hex, Hex, Hex, Hex),
+ 220: makeSyscallInfo("clone", CloneFlags, Hex, Hex, Hex, Hex),
+ 221: makeSyscallInfo("execve", Path, ExecveStringVector, ExecveStringVector),
+ 222: makeSyscallInfo("mmap", Hex, Hex, Hex, Hex, FD, Hex),
+ 223: makeSyscallInfo("fadvise64", FD, Hex, Hex, Hex),
+ 224: makeSyscallInfo("swapon", Hex, Hex),
+ 225: makeSyscallInfo("swapoff", Hex),
+ 226: makeSyscallInfo("mprotect", Hex, Hex, Hex),
+ 227: makeSyscallInfo("msync", Hex, Hex, Hex),
+ 228: makeSyscallInfo("mlock", Hex, Hex),
+ 229: makeSyscallInfo("munlock", Hex, Hex),
+ 230: makeSyscallInfo("mlockall", Hex),
+ 231: makeSyscallInfo("munlockall"),
+ 232: makeSyscallInfo("mincore", Hex, Hex, Hex),
+ 233: makeSyscallInfo("madvise", Hex, Hex, Hex),
+ 234: makeSyscallInfo("remap_file_pages", Hex, Hex, Hex, Hex, Hex),
+ 235: makeSyscallInfo("mbind", Hex, Hex, Hex, Hex, Hex, Hex),
+ 236: makeSyscallInfo("get_mempolicy", Hex, Hex, Hex, Hex, Hex),
+ 237: makeSyscallInfo("set_mempolicy", Hex, Hex, Hex),
+ 238: makeSyscallInfo("migrate_pages", Hex, Hex, Hex, Hex),
+ 239: makeSyscallInfo("move_pages", Hex, Hex, Hex, Hex, Hex, Hex),
+ 240: makeSyscallInfo("rt_tgsigqueueinfo", Hex, Hex, Signal, Hex),
+ 241: makeSyscallInfo("perf_event_open", Hex, Hex, Hex, Hex, Hex),
+ 242: makeSyscallInfo("accept4", FD, PostSockAddr, SockLen, SockFlags),
+ 243: makeSyscallInfo("recvmmsg", FD, Hex, Hex, Hex, Hex),
+
+ 260: makeSyscallInfo("wait4", Hex, Hex, Hex, Rusage),
+ 261: makeSyscallInfo("prlimit64", Hex, Hex, Hex, Hex),
+ 262: makeSyscallInfo("fanotify_init", Hex, Hex),
+ 263: makeSyscallInfo("fanotify_mark", Hex, Hex, Hex, Hex, Hex),
+ 264: makeSyscallInfo("name_to_handle_at", FD, Hex, Hex, Hex, Hex),
+ 265: makeSyscallInfo("open_by_handle_at", FD, Hex, Hex),
+ 266: makeSyscallInfo("clock_adjtime", Hex, Hex),
+ 267: makeSyscallInfo("syncfs", FD),
+ 268: makeSyscallInfo("setns", FD, Hex),
+ 269: makeSyscallInfo("sendmmsg", FD, Hex, Hex, Hex),
+ 270: makeSyscallInfo("process_vm_readv", Hex, ReadIOVec, Hex, IOVec, Hex, Hex),
+ 271: makeSyscallInfo("process_vm_writev", Hex, IOVec, Hex, WriteIOVec, Hex, Hex),
+ 272: makeSyscallInfo("kcmp", Hex, Hex, Hex, Hex, Hex),
+ 273: makeSyscallInfo("finit_module", Hex, Hex, Hex),
+ 274: makeSyscallInfo("sched_setattr", Hex, Hex, Hex),
+ 275: makeSyscallInfo("sched_getattr", Hex, Hex, Hex),
+ 276: makeSyscallInfo("renameat2", FD, Path, Hex, Path, Hex),
+ 277: makeSyscallInfo("seccomp", Hex, Hex, Hex),
+ 278: makeSyscallInfo("getrandom", Hex, Hex, Hex),
+ 279: makeSyscallInfo("memfd_create", Path, Hex),
+ 280: makeSyscallInfo("bpf", Hex, Hex, Hex),
+ 281: makeSyscallInfo("execveat", FD, Path, Hex, Hex, Hex),
+ 282: makeSyscallInfo("userfaultfd", Hex),
+ 283: makeSyscallInfo("membarrier", Hex),
+ 284: makeSyscallInfo("mlock2", Hex, Hex, Hex),
+ 285: makeSyscallInfo("copy_file_range", FD, Hex, FD, Hex, Hex, Hex),
+ 286: makeSyscallInfo("preadv2", FD, ReadIOVec, Hex, Hex, Hex),
+ 287: makeSyscallInfo("pwritev2", FD, WriteIOVec, Hex, Hex, Hex),
+ 291: makeSyscallInfo("statx", FD, Path, Hex, Hex, Hex),
+ 292: makeSyscallInfo("io_pgetevents", Hex, Hex, Hex, Hex, Timespec, SigSet),
+ 293: makeSyscallInfo("rseq", Hex, Hex, Hex, Hex),
+ 424: makeSyscallInfo("pidfd_send_signal", FD, Signal, Hex, Hex),
+ 425: makeSyscallInfo("io_uring_setup", Hex, Hex),
+ 426: makeSyscallInfo("io_uring_enter", FD, Hex, Hex, Hex, SigSet, Hex),
+ 427: makeSyscallInfo("io_uring_register", FD, Hex, Hex, Hex),
+ 428: makeSyscallInfo("open_tree", FD, Path, Hex),
+ 429: makeSyscallInfo("move_mount", FD, Path, FD, Path, Hex),
+ 430: makeSyscallInfo("fsopen", Path, Hex), // Not quite a path, but close.
+ 431: makeSyscallInfo("fsconfig", FD, Hex, Hex, Hex, Hex),
+ 432: makeSyscallInfo("fsmount", FD, Hex, Hex),
+ 433: makeSyscallInfo("fspick", FD, Path, Hex),
+ 434: makeSyscallInfo("pidfd_open", Hex, Hex),
+ 435: makeSyscallInfo("clone3", Hex, Hex),
+}
+
+func init() {
+ syscallTables = append(syscallTables,
+ syscallTable{
+ os: abi.Linux,
+ arch: arch.ARM64,
+ syscalls: linuxARM64})
+}
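
Aside (not part of the patch): with this change the strace package no longer hard-codes a single AMD64 table; each per-arch file appends its table from `init()`, and the shared `syscallTables` slice starts empty (see the `pkg/sentry/strace/syscalls.go` hunk later in this diff). A minimal sketch of that registration pattern, using simplified, hypothetical names rather than the real gVisor API:

```go
package strace // illustrative sketch; names below are simplified, not the real API

// table pairs an OS/arch combination with its syscall formatting map.
type table struct {
	os, arch string
	syscalls map[uintptr]string
}

// tables starts empty and is filled in by per-arch init() functions, so a
// binary only registers the architectures it was actually built with.
var tables []table

// register is what each arch-specific file would call from its init().
func register(t table) {
	tables = append(tables, t)
}

// lookup returns the table for an OS/arch pair, mirroring strace.Lookup.
func lookup(os, arch string) (table, bool) {
	for _, t := range tables {
		if t.os == os && t.arch == arch {
			return t, true
		}
	}
	return table{}, false
}
```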
diff --git a/pkg/sentry/strace/select.go b/pkg/sentry/strace/select.go
new file mode 100644
index 000000000..c77d418e6
--- /dev/null
+++ b/pkg/sentry/strace/select.go
@@ -0,0 +1,56 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package strace
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+)
+
+func fdsFromSet(t *kernel.Task, set []byte) []int {
+ var fds []int
+ // Append n if the n-th bit is 1.
+ for i, v := range set {
+ for j := 0; j < 8; j++ {
+ if (v>>j)&1 == 1 {
+ fds = append(fds, i*8+j)
+ }
+ }
+ }
+ return fds
+}
+
+func fdSet(t *kernel.Task, nfds int, addr usermem.Addr) string {
+ if nfds < 0 {
+ return fmt.Sprintf("%#x (negative nfds)", addr)
+ }
+ if addr == 0 {
+ return "null"
+ }
+
+ // Calculate the size of the fd set (one bit per fd).
+ nBytes := (nfds + 7) / 8
+ nBitsInLastPartialByte := nfds % 8
+
+ set, err := linux.CopyInFDSet(t, addr, nBytes, nBitsInLastPartialByte)
+ if err != nil {
+ return fmt.Sprintf("%#x (error decoding fdset: %s)", addr, err)
+ }
+
+ return fmt.Sprintf("%#x %v", addr, fdsFromSet(t, set))
+}
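
Aside (not part of the patch): `fdsFromSet` decodes an fd_set byte-by-byte, treating bit j of byte i as fd i*8+j. A small standalone demonstration of that mapping, assuming the same little-endian bit order as the sentry code:

```go
package main

import "fmt"

// fdsFromSet mirrors the decoder added in strace/select.go: byte i,
// bit j corresponds to fd i*8+j.
func fdsFromSet(set []byte) []int {
	var fds []int
	for i, v := range set {
		for j := 0; j < 8; j++ {
			if (v>>j)&1 == 1 {
				fds = append(fds, i*8+j)
			}
		}
	}
	return fds
}

func main() {
	// Bits 1 and 3 set in the first byte, bit 0 in the second byte.
	set := []byte{0x0a, 0x01}
	fmt.Println(fdsFromSet(set)) // prints [1 3 8]
}
```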
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 94334f6d2..b6d7177f4 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -208,6 +208,15 @@ func cmsghdr(t *kernel.Task, addr usermem.Addr, length uint64, maxBytes uint64)
i += linux.SizeOfControlMessageHeader
width := t.Arch().Width()
length := int(h.Length) - linux.SizeOfControlMessageHeader
+ if length < 0 {
+ strs = append(strs, fmt.Sprintf(
+ "{level=%s, type=%s, length=%d, content too short}",
+ level,
+ typ,
+ h.Length,
+ ))
+ break
+ }
if skipData {
strs = append(strs, fmt.Sprintf("{level=%s, type=%s, length=%d}", level, typ, h.Length))
@@ -332,7 +341,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string {
switch family {
case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX:
- fa, _, err := netstack.AddressAndFamily(int(family), b, true /* strict */)
+ fa, _, err := netstack.AddressAndFamily(b)
if err != nil {
return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err)
}
diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go
index 311389547..629c1f308 100644
--- a/pkg/sentry/strace/strace.go
+++ b/pkg/sentry/strace/strace.go
@@ -439,6 +439,8 @@ func (i *SyscallInfo) pre(t *kernel.Task, args arch.SyscallArguments, maximumBlo
output = append(output, capData(t, args[arg-1].Pointer(), args[arg].Pointer()))
case PollFDs:
output = append(output, pollFDs(t, args[arg].Pointer(), uint(args[arg+1].Uint()), false))
+ case SelectFDSet:
+ output = append(output, fdSet(t, int(args[0].Int()), args[arg].Pointer()))
case Oct:
output = append(output, "0o"+strconv.FormatUint(args[arg].Uint64(), 8))
case Hex:
diff --git a/pkg/sentry/strace/syscalls.go b/pkg/sentry/strace/syscalls.go
index 3c389d375..24e29a2ba 100644
--- a/pkg/sentry/strace/syscalls.go
+++ b/pkg/sentry/strace/syscalls.go
@@ -206,6 +206,10 @@ const (
// PollFDs is an array of struct pollfd. The number of entries in the
// array is in the next argument.
PollFDs
+
+ // SelectFDSet is an fd_set argument in select(2)/pselect(2). The number of
+ // fds represented must be the first argument.
+ SelectFDSet
)
// defaultFormat is the syscall argument format to use if the actual format is
@@ -246,14 +250,7 @@ type syscallTable struct {
syscalls SyscallMap
}
-// syscallTables contains all syscall tables.
-var syscallTables = []syscallTable{
- {
- os: abi.Linux,
- arch: arch.AMD64,
- syscalls: linuxAMD64,
- },
-}
+var syscallTables []syscallTable
// Lookup returns the SyscallMap for the OS/Arch combination. The returned map
// must not be changed.
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index 4c0bf96e4..430d796ba 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -13,6 +13,8 @@ go_library(
"sigset.go",
"sys_aio.go",
"sys_capability.go",
+ "sys_clone_amd64.go",
+ "sys_clone_arm64.go",
"sys_epoll.go",
"sys_eventfd.go",
"sys_file.go",
@@ -30,6 +32,7 @@ go_library(
"sys_random.go",
"sys_read.go",
"sys_rlimit.go",
+ "sys_rseq.go",
"sys_rusage.go",
"sys_sched.go",
"sys_seccomp.go",
@@ -49,6 +52,7 @@ go_library(
"sys_tls.go",
"sys_utsname.go",
"sys_write.go",
+ "sys_xattr.go",
"timespec.go",
],
importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls/linux",
@@ -89,6 +93,7 @@ go_library(
"//pkg/sentry/syscalls",
"//pkg/sentry/usage",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserr",
"//pkg/syserror",
"//pkg/waiter",
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index 1d9018c96..60469549d 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -16,13 +16,13 @@ package linux
import (
"io"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go
index 81e4f93a6..479c5f6ff 100644
--- a/pkg/sentry/syscalls/linux/linux64_amd64.go
+++ b/pkg/sentry/syscalls/linux/linux64_amd64.go
@@ -228,10 +228,10 @@ var AMD64 = &kernel.SyscallTable{
185: syscalls.Error("security", syserror.ENOSYS, "Not implemented in Linux.", nil),
186: syscalls.Supported("gettid", Gettid),
187: syscalls.Supported("readahead", Readahead),
- 188: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 188: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil),
189: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
190: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
- 191: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 191: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil),
192: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
193: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
194: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
@@ -260,7 +260,7 @@ var AMD64 = &kernel.SyscallTable{
217: syscalls.Supported("getdents64", Getdents64),
218: syscalls.Supported("set_tid_address", SetTidAddress),
219: syscalls.Supported("restart_syscall", RestartSyscall),
- 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920)
+ 220: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
221: syscalls.PartiallySupported("fadvise64", Fadvise64, "Not all options are supported.", nil),
222: syscalls.Supported("timer_create", TimerCreate),
223: syscalls.Supported("timer_settime", TimerSettime),
@@ -367,11 +367,31 @@ var AMD64 = &kernel.SyscallTable{
324: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267)
325: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
- // Syscalls after 325 are "backports" from versions of Linux after 4.4.
+ // Syscalls implemented after 325 are "backports" from versions
+ // of Linux after 4.4.
326: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
327: syscalls.Supported("preadv2", Preadv2),
328: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
+ 329: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
+ 330: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
+ 331: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
332: syscalls.Supported("statx", Statx),
+ 333: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
+ 334: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
+
+ // Linux skips ahead to syscall 424 to sync numbers between arches.
+ 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
+ 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
+ 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
+ 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
+ 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
+ 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
+ 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
+ 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
+ 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
+ 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
+ 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
+ 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
},
Emulate: map[usermem.Addr]uintptr{
diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go
index a809115e0..d3f61f5e8 100644
--- a/pkg/sentry/syscalls/linux/linux64_arm64.go
+++ b/pkg/sentry/syscalls/linux/linux64_arm64.go
@@ -41,10 +41,10 @@ var ARM64 = &kernel.SyscallTable{
2: syscalls.PartiallySupported("io_submit", IoSubmit, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
3: syscalls.PartiallySupported("io_cancel", IoCancel, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
4: syscalls.PartiallySupported("io_getevents", IoGetevents, "Generally supported with exceptions. User ring optimizations are not implemented.", []string{"gvisor.dev/issue/204"}),
- 5: syscalls.Error("setxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 5: syscalls.PartiallySupported("setxattr", Setxattr, "Only supported for tmpfs.", nil),
6: syscalls.Error("lsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
7: syscalls.Error("fsetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
- 8: syscalls.ErrorWithEvent("getxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
+ 8: syscalls.PartiallySupported("getxattr", Getxattr, "Only supported for tmpfs.", nil),
9: syscalls.ErrorWithEvent("lgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
10: syscalls.ErrorWithEvent("fgetxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
11: syscalls.ErrorWithEvent("listxattr", syserror.ENOTSUP, "Requires filesystem support.", nil),
@@ -224,7 +224,7 @@ var ARM64 = &kernel.SyscallTable{
189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
190: syscalls.Supported("semget", Semget),
191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, IPC_STAT, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
- 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920)
+ 192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
195: syscalls.PartiallySupported("shmctl", Shmctl, "Options SHM_LOCK, SHM_UNLOCK are not supported.", nil),
@@ -295,14 +295,33 @@ var ARM64 = &kernel.SyscallTable{
278: syscalls.Supported("getrandom", GetRandom),
279: syscalls.Supported("memfd_create", MemfdCreate),
280: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
- 281: syscalls.ErrorWithEvent("execveat", syserror.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836)
+ 281: syscalls.Supported("execveat", Execveat),
282: syscalls.ErrorWithEvent("userfaultfd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
283: syscalls.ErrorWithEvent("membarrier", syserror.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(gvisor.dev/issue/267)
284: syscalls.PartiallySupported("mlock2", Mlock2, "Stub implementation. The sandbox lacks appropriate permissions.", nil),
285: syscalls.ErrorWithEvent("copy_file_range", syserror.ENOSYS, "", nil),
286: syscalls.Supported("preadv2", Preadv2),
287: syscalls.PartiallySupported("pwritev2", Pwritev2, "Flag RWF_HIPRI is not supported.", nil),
+ 288: syscalls.ErrorWithEvent("pkey_mprotect", syserror.ENOSYS, "", nil),
+ 289: syscalls.ErrorWithEvent("pkey_alloc", syserror.ENOSYS, "", nil),
+ 290: syscalls.ErrorWithEvent("pkey_free", syserror.ENOSYS, "", nil),
291: syscalls.Supported("statx", Statx),
+ 292: syscalls.ErrorWithEvent("io_pgetevents", syserror.ENOSYS, "", nil),
+ 293: syscalls.PartiallySupported("rseq", RSeq, "Not supported on all platforms.", nil),
+
+ // Linux skips ahead to syscall 424 to sync numbers between arches.
+ 424: syscalls.ErrorWithEvent("pidfd_send_signal", syserror.ENOSYS, "", nil),
+ 425: syscalls.ErrorWithEvent("io_uring_setup", syserror.ENOSYS, "", nil),
+ 426: syscalls.ErrorWithEvent("io_uring_enter", syserror.ENOSYS, "", nil),
+ 427: syscalls.ErrorWithEvent("io_uring_register", syserror.ENOSYS, "", nil),
+ 428: syscalls.ErrorWithEvent("open_tree", syserror.ENOSYS, "", nil),
+ 429: syscalls.ErrorWithEvent("move_mount", syserror.ENOSYS, "", nil),
+ 430: syscalls.ErrorWithEvent("fsopen", syserror.ENOSYS, "", nil),
+ 431: syscalls.ErrorWithEvent("fsconfig", syserror.ENOSYS, "", nil),
+ 432: syscalls.ErrorWithEvent("fsmount", syserror.ENOSYS, "", nil),
+ 433: syscalls.ErrorWithEvent("fspick", syserror.ENOSYS, "", nil),
+ 434: syscalls.ErrorWithEvent("pidfd_open", syserror.ENOSYS, "", nil),
+ 435: syscalls.ErrorWithEvent("clone3", syserror.ENOSYS, "", nil),
},
Emulate: map[usermem.Addr]uintptr{},
diff --git a/pkg/sentry/syscalls/linux/sys_clone_amd64.go b/pkg/sentry/syscalls/linux/sys_clone_amd64.go
new file mode 100644
index 000000000..dd43cf18d
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_clone_amd64.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build amd64
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// Clone implements linux syscall clone(2).
+// sys_clone has so many flavors. We implement the default one in linux 3.11
+// x86_64:
+// sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val)
+func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := int(args[0].Int())
+ stack := args[1].Pointer()
+ parentTID := args[2].Pointer()
+ childTID := args[3].Pointer()
+ tls := args[4].Pointer()
+ return clone(t, flags, stack, parentTID, childTID, tls)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_clone_arm64.go b/pkg/sentry/syscalls/linux/sys_clone_arm64.go
new file mode 100644
index 000000000..cf68a8949
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_clone_arm64.go
@@ -0,0 +1,35 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+)
+
+// Clone implements linux syscall clone(2).
+// sys_clone has so many flavors, and we implement the default one in linux 3.11
+// arm64(kernel/fork.c with CONFIG_CLONE_BACKWARDS defined in the config file):
+// sys_clone(clone_flags, newsp, parent_tidptr, tls_val, child_tidptr)
+func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ flags := int(args[0].Int())
+ stack := args[1].Pointer()
+ parentTID := args[2].Pointer()
+ tls := args[3].Pointer()
+ childTID := args[4].Pointer()
+ return clone(t, flags, stack, parentTID, childTID, tls)
+}
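
Aside (not part of the patch): the only difference between the two new clone files is argument order. amd64 uses clone(flags, newsp, parent_tidptr, child_tidptr, tls_val), while arm64 (CONFIG_CLONE_BACKWARDS) swaps the last two: clone(flags, newsp, parent_tidptr, tls_val, child_tidptr). A hedged illustration with a simplified container type, not the gVisor structures:

```go
package main

import "fmt"

// cloneArgs is an illustrative container for the five clone(2) arguments.
type cloneArgs struct {
	flags                           int
	stack, parentTID, childTID, tls uintptr
}

// decodeAMD64 follows the x86_64 order used in sys_clone_amd64.go:
// clone(flags, newsp, parent_tidptr, child_tidptr, tls_val).
func decodeAMD64(a [5]uintptr) cloneArgs {
	return cloneArgs{flags: int(a[0]), stack: a[1], parentTID: a[2], childTID: a[3], tls: a[4]}
}

// decodeARM64 follows the CONFIG_CLONE_BACKWARDS order used in
// sys_clone_arm64.go: clone(flags, newsp, parent_tidptr, tls_val, child_tidptr).
func decodeARM64(a [5]uintptr) cloneArgs {
	return cloneArgs{flags: int(a[0]), stack: a[1], parentTID: a[2], tls: a[3], childTID: a[4]}
}

func main() {
	raw := [5]uintptr{0x11, 0x1000, 0x2000, 0x3000, 0x4000}
	fmt.Printf("amd64: %+v\narm64: %+v\n", decodeAMD64(raw), decodeARM64(raw))
}
```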
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index 167c2b60b..9bc2445a5 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -171,6 +171,9 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint
}
}
+ // Truncate is called when O_TRUNC is specified for any kind of
+ // existing Dirent. Behavior is delegated to the entry's Truncate
+ // implementation.
if flags&linux.O_TRUNC != 0 {
if err := d.Inode.Truncate(t, d, 0); err != nil {
return err
@@ -397,7 +400,9 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l
return err
}
- // Should we truncate the file?
+ // Truncate is called when O_TRUNC is specified for any kind of
+ // existing Dirent. Behavior is delegated to the entry's Truncate
+ // implementation.
if flags&linux.O_TRUNC != 0 {
if err := found.Inode.Truncate(t, found, 0); err != nil {
return err
@@ -835,25 +840,42 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
return uintptr(newfd), nil, nil
}
-func fGetOwn(t *kernel.Task, file *fs.File) int32 {
+func fGetOwnEx(t *kernel.Task, file *fs.File) linux.FOwnerEx {
ma := file.Async(nil)
if ma == nil {
- return 0
+ return linux.FOwnerEx{}
}
a := ma.(*fasync.FileAsync)
ot, otg, opg := a.Owner()
switch {
case ot != nil:
- return int32(t.PIDNamespace().IDOfTask(ot))
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_TID,
+ PID: int32(t.PIDNamespace().IDOfTask(ot)),
+ }
case otg != nil:
- return int32(t.PIDNamespace().IDOfThreadGroup(otg))
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_PID,
+ PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)),
+ }
case opg != nil:
- return int32(-t.PIDNamespace().IDOfProcessGroup(opg))
+ return linux.FOwnerEx{
+ Type: linux.F_OWNER_PGRP,
+ PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)),
+ }
default:
- return 0
+ return linux.FOwnerEx{}
}
}
+func fGetOwn(t *kernel.Task, file *fs.File) int32 {
+ owner := fGetOwnEx(t, file)
+ if owner.Type == linux.F_OWNER_PGRP {
+ return -owner.PID
+ }
+ return owner.PID
+}
+
// fSetOwn sets the file's owner with the semantics of F_SETOWN in Linux.
//
// If who is positive, it represents a PID. If negative, it represents a PGID.
@@ -896,11 +918,13 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
t.FDTable().SetFlags(fd, kernel.FDFlags{
CloseOnExec: flags&linux.FD_CLOEXEC != 0,
})
+ return 0, nil, nil
case linux.F_GETFL:
return uintptr(file.Flags().ToLinux()), nil, nil
case linux.F_SETFL:
flags := uint(args[2].Uint())
file.SetFlags(linuxToFlags(flags).Settable())
+ return 0, nil, nil
case linux.F_SETLK, linux.F_SETLKW:
// In Linux the file system can choose to provide lock operations for an inode.
// Normally pipe and socket types lack lock operations. We diverge and use a heavy
@@ -1003,6 +1027,44 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.F_SETOWN:
fSetOwn(t, file, args[2].Int())
return 0, nil, nil
+ case linux.F_GETOWN_EX:
+ addr := args[2].Pointer()
+ owner := fGetOwnEx(t, file)
+ _, err := t.CopyOut(addr, &owner)
+ return 0, nil, err
+ case linux.F_SETOWN_EX:
+ addr := args[2].Pointer()
+ var owner linux.FOwnerEx
+ n, err := t.CopyIn(addr, &owner)
+ if err != nil {
+ return 0, nil, err
+ }
+ a := file.Async(fasync.New).(*fasync.FileAsync)
+ switch owner.Type {
+ case linux.F_OWNER_TID:
+ task := t.PIDNamespace().TaskWithID(kernel.ThreadID(owner.PID))
+ if task == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ a.SetOwnerTask(t, task)
+ return uintptr(n), nil, nil
+ case linux.F_OWNER_PID:
+ tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(owner.PID))
+ if tg == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ a.SetOwnerThreadGroup(t, tg)
+ return uintptr(n), nil, nil
+ case linux.F_OWNER_PGRP:
+ pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(owner.PID))
+ if pg == nil {
+ return 0, nil, syserror.ESRCH
+ }
+ a.SetOwnerProcessGroup(t, pg)
+ return uintptr(n), nil, nil
+ default:
+ return 0, nil, syserror.EINVAL
+ }
case linux.F_GET_SEALS:
val, err := tmpfs.GetSeals(file.Dirent.Inode)
return uintptr(val), nil, err
@@ -1030,7 +1092,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
// Everything else is not yet supported.
return 0, nil, syserror.EINVAL
}
- return 0, nil, nil
}
const (
@@ -1484,6 +1545,8 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
if fs.IsDir(d.Inode.StableAttr) {
return syserror.EISDIR
}
+ // In contrast to open(O_TRUNC), truncate(2) is only valid for file
+ // types.
if !fs.IsFile(d.Inode.StableAttr) {
return syserror.EINVAL
}
@@ -1522,7 +1585,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, syserror.EINVAL
}
- // Note that this is different from truncate(2) above, where a
+ // In contrast to open(O_TRUNC), truncate(2) is only valid for file
+ // types. Note that this is different from truncate(2) above, where a
// directory returns EISDIR.
if !fs.IsFile(file.Dirent.Inode.StableAttr) {
return 0, nil, syserror.EINVAL
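
Aside (not part of the patch): with this hunk `fGetOwn` becomes a thin wrapper over `fGetOwnEx`, and only process-group owners are negated, preserving the classic F_GETOWN encoding in which a negative return value denotes a PGID. A small sketch of that sign convention, using illustrative constants rather than the linux package types:

```go
package main

import "fmt"

// Illustrative constants mirroring F_OWNER_TID / F_OWNER_PID / F_OWNER_PGRP.
const (
	ownerTID = iota
	ownerPID
	ownerPGRP
)

type ownerEx struct {
	Type int
	PID  int32
}

// getOwn collapses the extended owner into the classic F_GETOWN value:
// process groups are reported as negative IDs, everything else as-is.
func getOwn(o ownerEx) int32 {
	if o.Type == ownerPGRP {
		return -o.PID
	}
	return o.PID
}

func main() {
	fmt.Println(getOwn(ownerEx{Type: ownerPID, PID: 1234}))  // 1234
	fmt.Println(getOwn(ownerEx{Type: ownerPGRP, PID: 1234})) // -1234
}
```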
diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go
index b9bd25464..bde17a767 100644
--- a/pkg/sentry/syscalls/linux/sys_futex.go
+++ b/pkg/sentry/syscalls/linux/sys_futex.go
@@ -226,6 +226,11 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if mask == 0 {
return 0, nil, syserror.EINVAL
}
+ if val <= 0 {
+ // The Linux kernel wakes one waiter even if val is
+ // non-positive.
+ val = 1
+ }
n, err := t.Futex().Wake(t, addr, private, mask, val)
return uintptr(n), nil, err
@@ -242,6 +247,11 @@ func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
case linux.FUTEX_WAKE_OP:
op := uint32(val3)
+ if val <= 0 {
+ // The Linux kernel wakes one waiter even if val is
+ // non-positive.
+ val = 1
+ }
n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op)
return uintptr(n), nil, err
diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go
index 7a13beac2..2b2df989a 100644
--- a/pkg/sentry/syscalls/linux/sys_poll.go
+++ b/pkg/sentry/syscalls/linux/sys_poll.go
@@ -197,53 +197,51 @@ func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration)
return remainingTimeout, n, err
}
+// CopyInFDSet copies an fd set from select(2)/pselect(2).
+func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
+ set := make([]byte, nBytes)
+
+ if addr != 0 {
+ if _, err := t.CopyIn(addr, &set); err != nil {
+ return nil, err
+ }
+ // If we only use part of the last byte, mask out the extraneous bits.
+ //
+ // N.B. This only works on little-endian architectures.
+ if nBitsInLastPartialByte != 0 {
+ set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte
+ }
+ }
+ return set, nil
+}
+
func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
if nfds < 0 || nfds > fileCap {
return 0, syserror.EINVAL
}
- // Capture all the provided input vectors.
- //
- // N.B. This only works on little-endian architectures.
- byteCount := (nfds + 7) / 8
-
- bitsInLastPartialByte := uint(nfds % 8)
- r := make([]byte, byteCount)
- w := make([]byte, byteCount)
- e := make([]byte, byteCount)
+ // Calculate the size of the fd sets (one bit per fd).
+ nBytes := (nfds + 7) / 8
+ nBitsInLastPartialByte := nfds % 8
- if readFDs != 0 {
- if _, err := t.CopyIn(readFDs, &r); err != nil {
- return 0, err
- }
- // Mask out bits above nfds.
- if bitsInLastPartialByte != 0 {
- r[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
- }
+ // Capture all the provided input vectors.
+ r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
+ if err != nil {
+ return 0, err
}
-
- if writeFDs != 0 {
- if _, err := t.CopyIn(writeFDs, &w); err != nil {
- return 0, err
- }
- if bitsInLastPartialByte != 0 {
- w[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
- }
+ w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
+ if err != nil {
+ return 0, err
}
-
- if exceptFDs != 0 {
- if _, err := t.CopyIn(exceptFDs, &e); err != nil {
- return 0, err
- }
- if bitsInLastPartialByte != 0 {
- e[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
- }
+ e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
+ if err != nil {
+ return 0, err
}
// Count how many FDs are actually being requested so that we can build
// a PollFD array.
fdCount := 0
- for i := 0; i < byteCount; i++ {
+ for i := 0; i < nBytes; i++ {
v := r[i] | w[i] | e[i]
for v != 0 {
v &= (v - 1)
@@ -254,7 +252,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
// Build the PollFD array.
pfd := make([]linux.PollFD, 0, fdCount)
var fd int32
- for i := 0; i < byteCount; i++ {
+ for i := 0; i < nBytes; i++ {
rV, wV, eV := r[i], w[i], e[i]
v := rV | wV | eV
m := byte(1)
@@ -295,8 +293,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add
}
// Do the syscall, then count the number of bits set.
- _, _, err := pollBlock(t, pfd, timeout)
- if err != nil {
+ if _, _, err = pollBlock(t, pfd, timeout); err != nil {
return 0, syserror.ConvertIntr(err, syserror.EINTR)
}
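
Aside (not part of the patch): the refactor above factors the per-set copy-and-mask logic into `CopyInFDSet`, which clears any bits at or above nfds in the final partial byte. A standalone sketch of just the masking step, assuming the same little-endian bit order noted in the sentry comment:

```go
package main

import "fmt"

// maskFDSet mimics the tail-byte masking in CopyInFDSet: bits at or above
// nfds are cleared so stray data past the caller's fd count is ignored.
func maskFDSet(set []byte, nfds int) []byte {
	nBytes := (nfds + 7) / 8
	nBitsInLastPartialByte := nfds % 8
	out := append([]byte(nil), set[:nBytes]...)
	if nBitsInLastPartialByte != 0 {
		out[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte
	}
	return out
}

func main() {
	// nfds = 10: only bits 0-9 are meaningful; bits 10-15 get cleared.
	set := []byte{0xff, 0xff}
	fmt.Printf("%08b\n", maskFDSet(set, 10)) // prints [11111111 00000011]
}
```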
diff --git a/pkg/sentry/syscalls/linux/sys_rseq.go b/pkg/sentry/syscalls/linux/sys_rseq.go
new file mode 100644
index 000000000..90db10ea6
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_rseq.go
@@ -0,0 +1,48 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// RSeq implements syscall rseq(2).
+func RSeq(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].Uint()
+ flags := args[2].Int()
+ signature := args[3].Uint()
+
+ if !t.RSeqAvailable() {
+ // Event for applications that want rseq on a configuration
+ // that doesn't support them.
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.ENOSYS
+ }
+
+ switch flags {
+ case 0:
+ // Register.
+ return 0, nil, t.SetRSeq(addr, length, signature)
+ case linux.RSEQ_FLAG_UNREGISTER:
+ return 0, nil, t.ClearRSeq(addr, length, signature)
+ default:
+ // Unknown flag.
+ return 0, nil, syserror.EINVAL
+ }
+}
diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go
index d57ffb3a1..4a8bc24a2 100644
--- a/pkg/sentry/syscalls/linux/sys_shm.go
+++ b/pkg/sentry/syscalls/linux/sys_shm.go
@@ -39,10 +39,13 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, err
}
+ defer segment.DecRef()
return uintptr(segment.ID), nil, nil
}
 // findSegment retrieves a shm segment by the given id.
+//
+// findSegment returns a reference on Shm.
func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) {
r := t.IPCNamespace().ShmRegistry()
segment := r.FindByID(id)
@@ -63,6 +66,7 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if err != nil {
return 0, nil, syserror.EINVAL
}
+ defer segment.DecRef()
opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{
Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC,
@@ -72,7 +76,6 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
if err != nil {
return 0, nil, err
}
- defer segment.DecRef()
addr, err = t.MemoryManager().MMap(t, opts)
return uintptr(addr), nil, err
}
@@ -105,6 +108,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, syserror.EINVAL
}
+ defer segment.DecRef()
stat, err := segment.IPCStat(t)
if err == nil {
@@ -128,6 +132,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
if err != nil {
return 0, nil, syserror.EINVAL
}
+ defer segment.DecRef()
switch cmd {
case linux.IPC_SET:
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index ab1001f16..cda517a81 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -41,7 +41,7 @@ const maxListenBacklog = 1024
const maxAddrLen = 200
// maxOptLen is the maximum sockopt parameter length we're willing to accept.
-const maxOptLen = 1024
+const maxOptLen = 1024 * 8
// maxControlLen is the maximum length of the msghdr.msg_control buffer we're
// willing to accept. Note that this limit is smaller than Linux, which allows
@@ -764,7 +764,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
}
if !cms.Unix.Empty() {
mflags |= linux.MSG_CTRUNC
- cms.Unix.Release()
+ cms.Release()
}
if int(msg.Flags) != mflags {
@@ -784,24 +784,16 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i
if e != nil {
return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
}
- defer cms.Unix.Release()
+ defer cms.Release()
controlData := make([]byte, 0, msg.ControlLen)
+ controlData = control.PackControlMessages(t, cms, controlData)
if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() {
creds, _ := cms.Unix.Credentials.(control.SCMCredentials)
controlData, mflags = control.PackCredentials(t, creds, controlData, mflags)
}
- if cms.IP.HasTimestamp {
- controlData = control.PackTimestamp(t, cms.IP.Timestamp, controlData)
- }
-
- if cms.IP.HasInq {
- // In Linux, TCP_CM_INQ is added after SO_TIMESTAMP.
- controlData = control.PackInq(t, cms.IP.Inq, controlData)
- }
-
if cms.Unix.Rights != nil {
controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
}
@@ -877,7 +869,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag
}
n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
- cm.Unix.Release()
+ cm.Release()
if e != nil {
return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS)
}
@@ -1060,7 +1052,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme
}
// Call the syscall implementation.
- n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: controlMessages})
+ n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file)
if err != nil {
controlMessages.Release()
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 4115116ff..b47c3b5c4 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -220,19 +220,6 @@ func clone(t *kernel.Task, flags int, stack usermem.Addr, parentTID usermem.Addr
return uintptr(ntid), ctrl, err
}
-// Clone implements linux syscall clone(2).
-// sys_clone has so many flavors. We implement the default one in linux 3.11
-// x86_64:
-// sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val)
-func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- flags := int(args[0].Int())
- stack := args[1].Pointer()
- parentTID := args[2].Pointer()
- childTID := args[3].Pointer()
- tls := args[4].Pointer()
- return clone(t, flags, stack, parentTID, childTID, tls)
-}
-
// Fork implements Linux syscall fork(2).
func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
// "A call to fork() is equivalent to a call to clone(2) specifying flags
diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go
new file mode 100644
index 000000000..97d9a65ea
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_xattr.go
@@ -0,0 +1,169 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "strings"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Getxattr implements linux syscall getxattr(2).
+func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ nameAddr := args[1].Pointer()
+ valueAddr := args[2].Pointer()
+ size := args[3].SizeT()
+
+ path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ valueLen := 0
+ err = fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ value, err := getxattr(t, d, dirPath, nameAddr)
+ if err != nil {
+ return err
+ }
+
+ valueLen = len(value)
+ if size == 0 {
+ return nil
+ }
+ if size > linux.XATTR_SIZE_MAX {
+ size = linux.XATTR_SIZE_MAX
+ }
+ if valueLen > int(size) {
+ return syserror.ERANGE
+ }
+
+ _, err = t.CopyOutBytes(valueAddr, []byte(value))
+ return err
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+ return uintptr(valueLen), nil, nil
+}
+
+// getxattr implements getxattr from the given *fs.Dirent.
+func getxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr usermem.Addr) (string, error) {
+ if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+ return "", syserror.ENOTDIR
+ }
+
+ if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Read: true}); err != nil {
+ return "", err
+ }
+
+ name, err := copyInXattrName(t, nameAddr)
+ if err != nil {
+ return "", err
+ }
+
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return "", syserror.EOPNOTSUPP
+ }
+
+ return d.Inode.Getxattr(name)
+}
+
+// Setxattr implements linux syscall setxattr(2).
+func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ pathAddr := args[0].Pointer()
+ nameAddr := args[1].Pointer()
+ valueAddr := args[2].Pointer()
+ size := args[3].SizeT()
+ flags := args[4].Uint()
+
+ path, dirPath, err := copyInPath(t, pathAddr, false /* allowEmpty */)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ return 0, nil, fileOpOn(t, linux.AT_FDCWD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ return setxattr(t, d, dirPath, nameAddr, valueAddr, size, flags)
+ })
+}
+
+// setxattr implements setxattr from the given *fs.Dirent.
+func setxattr(t *kernel.Task, d *fs.Dirent, dirPath bool, nameAddr, valueAddr usermem.Addr, size uint, flags uint32) error {
+ if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+ return syserror.ENOTDIR
+ }
+
+ if err := checkXattrPermissions(t, d.Inode, fs.PermMask{Write: true}); err != nil {
+ return err
+ }
+
+ name, err := copyInXattrName(t, nameAddr)
+ if err != nil {
+ return err
+ }
+
+ if size > linux.XATTR_SIZE_MAX {
+ return syserror.E2BIG
+ }
+ buf := make([]byte, size)
+ if _, err = t.CopyInBytes(valueAddr, buf); err != nil {
+ return err
+ }
+ value := string(buf)
+
+ if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+ return syserror.EOPNOTSUPP
+ }
+
+ return d.Inode.Setxattr(name, value)
+}
+
+func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
+ name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
+ if err != nil {
+ if err == syserror.ENAMETOOLONG {
+ return "", syserror.ERANGE
+ }
+ return "", err
+ }
+ if len(name) == 0 {
+ return "", syserror.ERANGE
+ }
+ return name, nil
+}
+
+func checkXattrPermissions(t *kernel.Task, i *fs.Inode, perms fs.PermMask) error {
+ // Restrict xattrs to regular files and directories.
+ //
+ // In Linux, this restriction technically only applies to xattrs in the
+ // "user.*" namespace, but we don't allow any other xattr prefixes anyway.
+ if !fs.IsRegular(i.StableAttr) && !fs.IsDir(i.StableAttr) {
+ if perms.Write {
+ return syserror.EPERM
+ }
+ return syserror.ENODATA
+ }
+
+ return i.CheckPermission(t, perms)
+}
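
Aside (not part of the patch): the new xattr handlers follow the usual Linux semantics — a zero size queries the value length, a too-small buffer yields ERANGE, values are capped at XATTR_SIZE_MAX, and only "user."-prefixed names on regular files and directories are accepted. A hedged userspace sketch of the corresponding two-call pattern, using golang.org/x/sys/unix and a hypothetical path:

```go
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	const path = "/tmp/example" // hypothetical file on a tmpfs mount

	// Set a "user." attribute; other prefixes would return EOPNOTSUPP here.
	if err := unix.Setxattr(path, "user.demo", []byte("hello"), 0); err != nil {
		fmt.Println("setxattr:", err)
		return
	}

	// First call with a nil buffer to learn the value length...
	sz, err := unix.Getxattr(path, "user.demo", nil)
	if err != nil {
		fmt.Println("getxattr (size query):", err)
		return
	}

	// ...then fetch the value with a buffer of exactly that size.
	buf := make([]byte, sz)
	if _, err := unix.Getxattr(path, "user.demo", buf); err != nil {
		fmt.Println("getxattr:", err)
		return
	}
	fmt.Printf("user.demo = %q\n", buf)
}
```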
diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD
index 18e212dff..3cde3a0be 100644
--- a/pkg/sentry/time/BUILD
+++ b/pkg/sentry/time/BUILD
@@ -9,7 +9,7 @@ go_template_instance(
out = "seqatomic_parameters_unsafe.go",
package = "time",
suffix = "Parameters",
- template = "//pkg/syncutil:generic_seqatomic",
+ template = "//pkg/sync:generic_seqatomic",
types = {
"Value": "Parameters",
},
@@ -36,7 +36,7 @@ go_library(
deps = [
"//pkg/log",
"//pkg/metric",
- "//pkg/syncutil",
+ "//pkg/sync",
"//pkg/syserror",
],
)
diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go
index 318503277..f9a93115d 100644
--- a/pkg/sentry/time/calibrated_clock.go
+++ b/pkg/sentry/time/calibrated_clock.go
@@ -17,11 +17,11 @@
package time
import (
- "sync"
"time"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/metric"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD
index c32fe3241..5518ac3d0 100644
--- a/pkg/sentry/usage/BUILD
+++ b/pkg/sentry/usage/BUILD
@@ -18,5 +18,6 @@ go_library(
deps = [
"//pkg/bits",
"//pkg/memutil",
+ "//pkg/sync",
],
)
diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go
index d6ef644d8..538c645eb 100644
--- a/pkg/sentry/usage/memory.go
+++ b/pkg/sentry/usage/memory.go
@@ -17,12 +17,12 @@ package usage
import (
"fmt"
"os"
- "sync"
"sync/atomic"
"syscall"
"gvisor.dev/gvisor/pkg/bits"
"gvisor.dev/gvisor/pkg/memutil"
+ "gvisor.dev/gvisor/pkg/sync"
)
// MemoryKind represents a type of memory used by the application.
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 74a325309..35c7be259 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -9,6 +9,7 @@ go_library(
"context.go",
"debug.go",
"dentry.go",
+ "device.go",
"file_description.go",
"file_description_impl_util.go",
"filesystem.go",
@@ -17,9 +18,9 @@ go_library(
"mount.go",
"mount_unsafe.go",
"options.go",
+ "pathname.go",
"permissions.go",
"resolving_path.go",
- "syscalls.go",
"testutil.go",
"vfs.go",
],
@@ -33,7 +34,7 @@ go_library(
"//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
"//pkg/sentry/usermem",
- "//pkg/syncutil",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/waiter",
],
@@ -53,6 +54,7 @@ go_test(
"//pkg/sentry/context/contexttest",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/usermem",
+ "//pkg/sync",
"//pkg/syserror",
],
)
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index 32cf9151b..705194ebc 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -24,6 +24,9 @@ type contextID int
const (
// CtxMountNamespace is a Context.Value key for a MountNamespace.
CtxMountNamespace contextID = iota
+
+ // CtxRoot is a Context.Value key for a VFS root.
+ CtxRoot
)
// MountNamespaceFromContext returns the MountNamespace used by ctx. It does
@@ -35,3 +38,13 @@ func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
}
return nil
}
+
+// RootFromContext returns the VFS root used by ctx. It takes a reference on
+// the returned VirtualDentry. If ctx does not have a specific VFS root,
+// RootFromContext returns a zero-value VirtualDentry.
+func RootFromContext(ctx context.Context) VirtualDentry {
+ if v := ctx.Value(CtxRoot); v != nil {
+ return v.(VirtualDentry)
+ }
+ return VirtualDentry{}
+}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 40f4c1d09..486a76475 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -16,9 +16,9 @@ package vfs
import (
"fmt"
- "sync"
"sync/atomic"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -85,12 +85,12 @@ type Dentry struct {
// mounts is accessed using atomic memory operations.
mounts uint32
- // mu synchronizes disowning and mounting over this Dentry.
- mu sync.Mutex
-
// children are child Dentries.
children map[string]*Dentry
+ // mu synchronizes disowning and mounting over this Dentry.
+ mu sync.Mutex
+
// impl is the DentryImpl associated with this Dentry. impl is immutable.
// This should be the last field in Dentry.
impl DentryImpl
@@ -199,6 +199,18 @@ func (d *Dentry) HasChildren() bool {
return len(d.children) != 0
}
+// Children returns a map containing all of d's children.
+func (d *Dentry) Children() map[string]*Dentry {
+ if !d.HasChildren() {
+ return nil
+ }
+ m := make(map[string]*Dentry)
+ for name, child := range d.children {
+ m[name] = child
+ }
+ return m
+}
+
// InsertChild makes child a child of d with the given name.
//
// InsertChild is a mutator of d and child.
@@ -222,6 +234,18 @@ func (d *Dentry) InsertChild(child *Dentry, name string) {
child.name = name
}
+// IsAncestorOf returns true if d is an ancestor of d2; that is, d is either
+// d2's parent or an ancestor of d2's parent.
+func (d *Dentry) IsAncestorOf(d2 *Dentry) bool {
+ for d2.parent != nil {
+ if d2.parent == d {
+ return true
+ }
+ d2 = d2.parent
+ }
+ return false
+}
+
// PrepareDeleteDentry must be called before attempting to delete the file
// represented by d. If PrepareDeleteDentry succeeds, the caller must call
// AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome.
@@ -271,21 +295,6 @@ func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
}
}
-// DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as
-// appropriate for in-memory filesystems that don't need to ensure that some
-// external state change succeeds before committing the deletion.
-//
-// DeleteDentry is a mutator of d and d.Parent().
-//
-// Preconditions: d is a child Dentry.
-func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error {
- if err := vfs.PrepareDeleteDentry(mntns, d); err != nil {
- return err
- }
- vfs.CommitDeleteDentry(d)
- return nil
-}
-
// ForceDeleteDentry causes d to become disowned. It should only be used in
// cases where VFS has no ability to stop the deletion (e.g. d represents the
// local state of a file on a remote filesystem on which the file has already
@@ -314,7 +323,7 @@ func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) {
// CommitRenameExchangeDentry depending on the rename's outcome.
//
// Preconditions: from is a child Dentry. If to is not nil, it must be a child
-// Dentry from the same Filesystem.
+// Dentry from the same Filesystem. from != to.
func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
if checkInvariants {
if from.parent == nil {
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
new file mode 100644
index 000000000..cb672e36f
--- /dev/null
+++ b/pkg/sentry/vfs/device.go
@@ -0,0 +1,100 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// DeviceKind indicates whether a device is a block or character device.
+type DeviceKind uint32
+
+const (
+ // BlockDevice indicates a block device.
+ BlockDevice DeviceKind = iota
+
+ // CharDevice indicates a character device.
+ CharDevice
+)
+
+// String implements fmt.Stringer.String.
+func (kind DeviceKind) String() string {
+ switch kind {
+ case BlockDevice:
+ return "block"
+ case CharDevice:
+ return "character"
+ default:
+ return fmt.Sprintf("invalid device kind %d", kind)
+ }
+}
+
+type devTuple struct {
+ kind DeviceKind
+ major uint32
+ minor uint32
+}
+
+// A Device backs device special files.
+type Device interface {
+ // Open returns a FileDescription representing this device.
+ Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
+}
+
+type registeredDevice struct {
+ dev Device
+ opts RegisterDeviceOptions
+}
+
+// RegisterDeviceOptions contains options to
+// VirtualFilesystem.RegisterDevice().
+type RegisterDeviceOptions struct {
+ // GroupName is the name shown for this device registration in
+ // /proc/devices. If GroupName is empty, this registration will not be
+ // shown in /proc/devices.
+ GroupName string
+}
+
+// RegisterDevice registers the given Device in vfs with the given major and
+// minor device numbers.
+func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error {
+ tup := devTuple{kind, major, minor}
+ vfs.devicesMu.Lock()
+ defer vfs.devicesMu.Unlock()
+ if existing, ok := vfs.devices[tup]; ok {
+ return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev)
+ }
+ vfs.devices[tup] = &registeredDevice{
+ dev: dev,
+ opts: *opts,
+ }
+ return nil
+}
+
+// OpenDeviceSpecialFile returns a FileDescription representing the given
+// device.
+func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) {
+ tup := devTuple{kind, major, minor}
+ vfs.devicesMu.RLock()
+ defer vfs.devicesMu.RUnlock()
+ rd, ok := vfs.devices[tup]
+ if !ok {
+ return nil, syserror.ENXIO
+ }
+ return rd.dev.Open(ctx, mnt, d, *opts)
+}
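As a usage sketch of the new device registry: a hypothetical character device type can be registered once and then resolved by OpenDeviceSpecialFile. memDevice, registerMemDevice, and the (1, 3) device number are illustrative assumptions, not part of this change.

// Minimal in-package sketch. A real Device.Open would construct and return a
// FileDescription; this one simply fails the open.
type memDevice struct{}

// Open implements Device.Open.
func (memDevice) Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error) {
	return nil, syserror.ENODEV
}

func registerMemDevice(vfsObj *VirtualFilesystem) error {
	return vfsObj.RegisterDevice(CharDevice, 1, 3, memDevice{}, &RegisterDeviceOptions{
		GroupName: "mem", // shown in /proc/devices
	})
}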
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 34007eb57..6afe280bc 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -20,8 +20,10 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -38,49 +40,58 @@ type FileDescription struct {
// operations.
refs int64
+ // statusFlags contains status flags, "initialized by open(2) and possibly
+ // modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic
+ // memory operations.
+ statusFlags uint32
+
// vd is the filesystem location at which this FileDescription was opened.
// A reference is held on vd. vd is immutable.
vd VirtualDentry
+ opts FileDescriptionOptions
+
// impl is the FileDescriptionImpl associated with this Filesystem. impl is
// immutable. This should be the last field in FileDescription.
impl FileDescriptionImpl
}
-// Init must be called before first use of fd. It takes ownership of references
-// on mnt and d held by the caller.
-func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) {
+// FileDescriptionOptions contains options to FileDescription.Init().
+type FileDescriptionOptions struct {
+ // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is
+ // usually only the case if O_DIRECT would actually have an effect.
+ AllowDirectIO bool
+
+ // If UseDentryMetadata is true, calls to FileDescription methods that
+ // interact with file and filesystem metadata (Stat, SetStat, StatFS,
+ // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
+ // the corresponding FilesystemImpl methods instead of the corresponding
+ // FileDescriptionImpl methods.
+ //
+ // UseDentryMetadata is intended for file descriptions that are implemented
+ // outside of individual filesystems, such as pipes, sockets, and device
+ // special files. FileDescriptions for which UseDentryMetadata is true may
+ // embed DentryMetadataFileDescriptionImpl to obtain appropriate
+ // implementations of FileDescriptionImpl methods that should not be
+ // called.
+ UseDentryMetadata bool
+}
+
+// Init must be called before first use of fd. It takes references on mnt and
+// d. statusFlags is the initial file description status flags, which is
+// usually the full set of flags passed to open(2).
+func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) {
fd.refs = 1
+ fd.statusFlags = statusFlags | linux.O_LARGEFILE
fd.vd = VirtualDentry{
mount: mnt,
dentry: d,
}
+ fd.vd.IncRef()
+ fd.opts = *opts
fd.impl = impl
}
-// Impl returns the FileDescriptionImpl associated with fd.
-func (fd *FileDescription) Impl() FileDescriptionImpl {
- return fd.impl
-}
-
-// Mount returns the mount on which fd was opened. It does not take a reference
-// on the returned Mount.
-func (fd *FileDescription) Mount() *Mount {
- return fd.vd.mount
-}
-
-// Dentry returns the dentry at which fd was opened. It does not take a
-// reference on the returned Dentry.
-func (fd *FileDescription) Dentry() *Dentry {
- return fd.vd.dentry
-}
-
-// VirtualDentry returns the location at which fd was opened. It does not take
-// a reference on the returned VirtualDentry.
-func (fd *FileDescription) VirtualDentry() VirtualDentry {
- return fd.vd
-}
-
// IncRef increments fd's reference count.
func (fd *FileDescription) IncRef() {
atomic.AddInt64(&fd.refs, 1)
@@ -112,6 +123,82 @@ func (fd *FileDescription) DecRef() {
}
}
+// Mount returns the mount on which fd was opened. It does not take a reference
+// on the returned Mount.
+func (fd *FileDescription) Mount() *Mount {
+ return fd.vd.mount
+}
+
+// Dentry returns the dentry at which fd was opened. It does not take a
+// reference on the returned Dentry.
+func (fd *FileDescription) Dentry() *Dentry {
+ return fd.vd.dentry
+}
+
+// VirtualDentry returns the location at which fd was opened. It does not take
+// a reference on the returned VirtualDentry.
+func (fd *FileDescription) VirtualDentry() VirtualDentry {
+ return fd.vd
+}
+
+// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
+func (fd *FileDescription) StatusFlags() uint32 {
+ return atomic.LoadUint32(&fd.statusFlags)
+}
+
+// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
+func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error {
+ // Compare Linux's fs/fcntl.c:setfl().
+ oldFlags := fd.StatusFlags()
+ // Linux documents this check as "O_APPEND cannot be cleared if the file is
+ // marked as append-only and the file is open for write", which would make
+ // sense. However, the check as actually implemented seems to be "O_APPEND
+ // cannot be changed if the file is marked as append-only".
+ if (flags^oldFlags)&linux.O_APPEND != 0 {
+ stat, err := fd.Stat(ctx, StatOptions{
+ // There is no mask bit for stx_attributes.
+ Mask: 0,
+ // Linux just reads inode::i_flags directly.
+ Sync: linux.AT_STATX_DONT_SYNC,
+ })
+ if err != nil {
+ return err
+ }
+ if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) {
+ return syserror.EPERM
+ }
+ }
+ if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) {
+ stat, err := fd.Stat(ctx, StatOptions{
+ Mask: linux.STATX_UID,
+ // Linux's inode_owner_or_capable() just reads inode::i_uid
+ // directly.
+ Sync: linux.AT_STATX_DONT_SYNC,
+ })
+ if err != nil {
+ return err
+ }
+ if stat.Mask&linux.STATX_UID == 0 {
+ return syserror.EPERM
+ }
+ if !CanActAsOwner(creds, auth.KUID(stat.UID)) {
+ return syserror.EPERM
+ }
+ }
+ if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO {
+ return syserror.EINVAL
+ }
+ // TODO(jamieliu): FileDescriptionImpl.SetOAsync()?
+ const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK
+ atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags))
+ return nil
+}

+
+// Impl returns the FileDescriptionImpl associated with fd.
+func (fd *FileDescription) Impl() FileDescriptionImpl {
+ return fd.impl
+}
+
// FileDescriptionImpl contains implementation details for a FileDescription.
// Implementations of FileDescriptionImpl should contain their associated
// FileDescription by value as their first field.
@@ -120,6 +207,8 @@ func (fd *FileDescription) DecRef() {
// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and
// auth.KGID respectively).
//
+// All methods may return errors not specified.
+//
// FileDescriptionImpl is analogous to Linux's struct file_operations.
type FileDescriptionImpl interface {
// Release is called when the associated FileDescription reaches zero
@@ -131,14 +220,6 @@ type FileDescriptionImpl interface {
// prevent the file descriptor from being closed.
OnClose(ctx context.Context) error
- // StatusFlags returns file description status flags, as for
- // fcntl(F_GETFL).
- StatusFlags(ctx context.Context) (uint32, error)
-
- // SetStatusFlags sets file description status flags, as for
- // fcntl(F_SETFL).
- SetStatusFlags(ctx context.Context, flags uint32) error
-
// Stat returns metadata for the file represented by the FileDescription.
Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
@@ -156,6 +237,10 @@ type FileDescriptionImpl interface {
// PRead reads from the file into dst, starting at the given offset, and
// returns the number of bytes read. PRead is permitted to return partial
// reads with a nil error.
+ //
+ // Errors:
+ //
+ // - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
// Read is similar to PRead, but does not specify an offset.
@@ -165,6 +250,10 @@ type FileDescriptionImpl interface {
// the number of bytes read; note that POSIX 2.9.7 "Thread Interactions
// with Regular File Operations" requires that all operations that may
// mutate the FileDescription offset are serialized.
+ //
+ // Errors:
+ //
+ // - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP.
Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)
// PWrite writes src to the file, starting at the given offset, and returns
@@ -174,6 +263,11 @@ type FileDescriptionImpl interface {
// As in Linux (but not POSIX), if O_APPEND is in effect for the
// FileDescription, PWrite should ignore the offset and append data to the
// end of the file.
+ //
+ // Errors:
+ //
+ // - If opts.Flags specifies unsupported options, PWrite returns
+ // EOPNOTSUPP.
PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
// Write is similar to PWrite, but does not specify an offset, which is
@@ -183,6 +277,10 @@ type FileDescriptionImpl interface {
// PWrite that uses a FileDescription offset, to make it possible for
// remote filesystems to implement O_APPEND correctly (i.e. atomically with
// respect to writers outside the scope of VFS).
+ //
+ // Errors:
+ //
+ // - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP.
Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)
// IterDirents invokes cb on each entry in the directory represented by the
@@ -212,7 +310,21 @@ type FileDescriptionImpl interface {
// Ioctl implements the ioctl(2) syscall.
Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
- // TODO: extended attributes; file locking
+ // Listxattr returns all extended attribute names for the file.
+ Listxattr(ctx context.Context) ([]string, error)
+
+ // Getxattr returns the value associated with the given extended attribute
+ // for the file.
+ Getxattr(ctx context.Context, name string) (string, error)
+
+ // Setxattr changes the value associated with the given extended attribute
+ // for the file.
+ Setxattr(ctx context.Context, opts SetxattrOptions) error
+
+ // Removexattr removes the given extended attribute from the file.
+ Removexattr(ctx context.Context, name string) error
+
+ // TODO: file locking
}
// Dirent holds the information contained in struct linux_dirent64.
@@ -241,3 +353,230 @@ type IterDirentsCallback interface {
// called.
Handle(dirent Dirent) bool
}
+
+// OnClose is called when a file descriptor representing the FileDescription is
+// closed. Returning a non-nil error should not prevent the file descriptor
+// from being closed.
+func (fd *FileDescription) OnClose(ctx context.Context) error {
+ return fd.impl.OnClose(ctx)
+}
+
+// Stat returns metadata for the file represented by fd.
+func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
+ vfsObj.putResolvingPath(rp)
+ return stat, err
+ }
+ return fd.impl.Stat(ctx, opts)
+}
+
+// SetStat updates metadata for the file represented by fd.
+func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
+ vfsObj.putResolvingPath(rp)
+ return err
+ }
+ return fd.impl.SetStat(ctx, opts)
+}
+
+// StatFS returns metadata for the filesystem containing the file represented
+// by fd.
+func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
+ vfsObj.putResolvingPath(rp)
+ return statfs, err
+ }
+ return fd.impl.StatFS(ctx)
+}
+
+// PRead reads from the file represented by fd into dst, starting at the given
+// offset, and returns the number of bytes read. PRead is permitted to return
+// partial reads with a nil error.
+func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ return fd.impl.PRead(ctx, dst, offset, opts)
+}
+
+// Read is similar to PRead, but does not specify an offset.
+func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ return fd.impl.Read(ctx, dst, opts)
+}
+
+// PWrite writes src to the file represented by fd, starting at the given
+// offset, and returns the number of bytes written. PWrite is permitted to
+// return partial writes with a nil error.
+func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ return fd.impl.PWrite(ctx, src, offset, opts)
+}
+
+// Write is similar to PWrite, but does not specify an offset.
+func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ return fd.impl.Write(ctx, src, opts)
+}
+
+// IterDirents invokes cb on each entry in the directory represented by fd. If
+// IterDirents has been called since the last call to Seek, it continues
+// iteration from the end of the last call.
+func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
+ return fd.impl.IterDirents(ctx, cb)
+}
+
+// Seek changes fd's offset (assuming one exists) and returns its new value.
+func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ return fd.impl.Seek(ctx, offset, whence)
+}
+
+// Sync has the semantics of fsync(2).
+func (fd *FileDescription) Sync(ctx context.Context) error {
+ return fd.impl.Sync(ctx)
+}
+
+// ConfigureMMap mutates opts to implement mmap(2) for the file represented by
+// fd.
+func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+ return fd.impl.ConfigureMMap(ctx, opts)
+}
+
+// Ioctl implements the ioctl(2) syscall.
+func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ return fd.impl.Ioctl(ctx, uio, args)
+}
+
+// Listxattr returns all extended attribute names for the file represented by
+// fd.
+func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp)
+ vfsObj.putResolvingPath(rp)
+ return names, err
+ }
+ names, err := fd.impl.Listxattr(ctx)
+ if err == syserror.ENOTSUP {
+ // Linux doesn't actually return ENOTSUP in this case; instead,
+ // fs/xattr.c:vfs_listxattr() falls back to allowing the security
+ // subsystem to return security extended attributes, which by default
+ // don't exist.
+ return nil, nil
+ }
+ return names, err
+}
+
+// Getxattr returns the value associated with the given extended attribute for
+// the file represented by fd.
+func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name)
+ vfsObj.putResolvingPath(rp)
+ return val, err
+ }
+ return fd.impl.Getxattr(ctx, name)
+}
+
+// Setxattr changes the value associated with the given extended attribute for
+// the file represented by fd.
+func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts)
+ vfsObj.putResolvingPath(rp)
+ return err
+ }
+ return fd.impl.Setxattr(ctx, opts)
+}
+
+// Removexattr removes the given extended attribute from the file represented
+// by fd.
+func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+ vfsObj.putResolvingPath(rp)
+ return err
+ }
+ return fd.impl.Removexattr(ctx, name)
+}
+
+// SyncFS instructs the filesystem containing fd to execute the semantics of
+// syncfs(2).
+func (fd *FileDescription) SyncFS(ctx context.Context) error {
+ return fd.vd.mount.fs.impl.Sync(ctx)
+}
+
+// MappedName implements memmap.MappingIdentity.MappedName.
+func (fd *FileDescription) MappedName(ctx context.Context) string {
+ vfsroot := RootFromContext(ctx)
+ s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd)
+ if vfsroot.Ok() {
+ vfsroot.DecRef()
+ }
+ return s
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+func (fd *FileDescription) DeviceID() uint64 {
+ stat, err := fd.Stat(context.Background(), StatOptions{
+ // There is no STATX_DEV; we assume that Stat will return it if it's
+ // available regardless of mask.
+ Mask: 0,
+ // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev
+ // directly.
+ Sync: linux.AT_STATX_DONT_SYNC,
+ })
+ if err != nil {
+ return 0
+ }
+ return uint64(linux.MakeDeviceID(uint16(stat.DevMajor), stat.DevMinor))
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+func (fd *FileDescription) InodeID() uint64 {
+ stat, err := fd.Stat(context.Background(), StatOptions{
+ Mask: linux.STATX_INO,
+ // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly.
+ Sync: linux.AT_STATX_DONT_SYNC,
+ })
+ if err != nil || stat.Mask&linux.STATX_INO == 0 {
+ return 0
+ }
+ return stat.Ino
+}
+
+// Msync implements memmap.MappingIdentity.Msync.
+func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
+ return fd.Sync(ctx)
+}
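As a usage sketch: status flags now live on the FileDescription itself, so fcntl(F_SETFL)-style updates go through SetStatusFlags rather than the impl. setAppend is a hypothetical helper; ctx and creds are assumed to come from the calling task.

// Hypothetical helper: enable O_APPEND on an open file description.
// SetStatusFlags performs the append-only and O_NOATIME ownership checks shown
// above before storing the new flags atomically.
func setAppend(ctx context.Context, creds *auth.Credentials, fd *FileDescription) error {
	flags := fd.StatusFlags() | linux.O_APPEND
	return fd.SetStatusFlags(ctx, creds, flags)
}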
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 4fbad7840..c00b3c84b 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -17,13 +17,13 @@ package vfs
import (
"bytes"
"io"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -127,6 +127,31 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
return 0, syserror.ENOTTY
}
+// Listxattr implements FileDescriptionImpl.Listxattr analogously to
+// inode_operations::listxattr == NULL in Linux.
+func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) {
+ // This isn't exactly accurate; see FileDescription.Listxattr.
+ return nil, syserror.ENOTSUP
+}
+
+// Getxattr implements FileDescriptionImpl.Getxattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) {
+ return "", syserror.ENOTSUP
+}
+
+// Setxattr implements FileDescriptionImpl.Setxattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+ return syserror.ENOTSUP
+}
+
+// Removexattr implements FileDescriptionImpl.Removexattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error {
+ return syserror.ENOTSUP
+}
+
// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl that always represent directories to obtain
// implementations of non-directory I/O methods that return EISDIR.
@@ -152,6 +177,21 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme
return 0, syserror.EISDIR
}
+// DentryMetadataFileDescriptionImpl may be embedded by implementations of
+// FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is
+// true to obtain implementations of Stat and SetStat that panic.
+type DentryMetadataFileDescriptionImpl struct{}
+
+// Stat implements FileDescriptionImpl.Stat.
+func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+ panic("illegal call to DentryMetadataFileDescriptionImpl.Stat")
+}
+
+// SetStat implements FileDescriptionImpl.SetStat.
+func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error {
+ panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat")
+}
+
// DynamicBytesFileDescriptionImpl may be embedded by implementations of
// FileDescriptionImpl that represent read-only regular files whose contents
// are backed by a bytes.Buffer that is regenerated when necessary, consistent
@@ -174,6 +214,17 @@ type DynamicBytesSource interface {
Generate(ctx context.Context, buf *bytes.Buffer) error
}
+// StaticData implements DynamicBytesSource over a static string.
+type StaticData struct {
+ Data string
+}
+
+// Generate implements DynamicBytesSource.
+func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ buf.WriteString(s.Data)
+ return nil
+}
+
// SetDataSource must be called exactly once on fd before first use.
func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
fd.data = data
@@ -252,3 +303,12 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6
fd.off = offset
return offset, nil
}
+
+// GenericConfigureMMap may be used by most implementations of
+// FileDescriptionImpl.ConfigureMMap.
+func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error {
+ opts.Mappable = m
+ opts.MappingIdentity = fd
+ fd.IncRef()
+ return nil
+}
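A sketch of the new GenericConfigureMMap helper in use: a hypothetical regularFileFD (with an embedded vfsfd FileDescription and a memmap.Mappable field, both assumptions for illustration) delegates ConfigureMMap to the helper, which installs the FileDescription as the mapping identity and takes a reference on it.

// Hypothetical FileDescriptionImpl fragment; other required methods (Release,
// Stat, SetStat, read/write, ...) are elided.
type regularFileFD struct {
	vfsfd FileDescription
	FileDescriptionDefaultImpl
	mappable memmap.Mappable
}

// ConfigureMMap implements FileDescriptionImpl.ConfigureMMap.
func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
	return GenericConfigureMMap(&fd.vfsfd, fd.mappable, opts)
}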
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index a5561dcbe..9ed58512f 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -48,7 +48,7 @@ type genCountFD struct {
func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription {
var fd genCountFD
- fd.vfsfd.Init(&fd, mnt, vfsd)
+ fd.vfsfd.Init(&fd, 0 /* statusFlags */, mnt, vfsd, &FileDescriptionOptions{})
fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd)
return &fd.vfsfd
}
@@ -89,7 +89,7 @@ func TestGenCountFD(t *testing.T) {
creds := auth.CredentialsFromContext(ctx)
vfsObj := New() // vfs.New()
- vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{})
+ vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}, &RegisterFilesystemTypeOptions{})
mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{})
if err != nil {
t.Fatalf("failed to create testfs root mount: %v", err)
@@ -103,7 +103,7 @@ func TestGenCountFD(t *testing.T) {
// The first read causes Generate to be called to fill the FD's buffer.
buf := make([]byte, 2)
ioseq := usermem.BytesIOSequence(buf)
- n, err := fd.Impl().Read(ctx, ioseq, ReadOptions{})
+ n, err := fd.Read(ctx, ioseq, ReadOptions{})
if n != 1 || (err != nil && err != io.EOF) {
t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err)
}
@@ -112,17 +112,17 @@ func TestGenCountFD(t *testing.T) {
}
// A second read without seeking is still at EOF.
- n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+ n, err = fd.Read(ctx, ioseq, ReadOptions{})
if n != 0 || err != io.EOF {
t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err)
}
// Seeking to the beginning of the file causes it to be regenerated.
- n, err = fd.Impl().Seek(ctx, 0, linux.SEEK_SET)
+ n, err = fd.Seek(ctx, 0, linux.SEEK_SET)
if n != 0 || err != nil {
t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err)
}
- n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+ n, err = fd.Read(ctx, ioseq, ReadOptions{})
if n != 1 || (err != nil && err != io.EOF) {
t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err)
}
@@ -131,7 +131,7 @@ func TestGenCountFD(t *testing.T) {
}
// PRead at the beginning of the file also causes it to be regenerated.
- n, err = fd.Impl().PRead(ctx, ioseq, 0, ReadOptions{})
+ n, err = fd.PRead(ctx, ioseq, 0, ReadOptions{})
if n != 1 || (err != nil && err != io.EOF) {
t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err)
}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 76ff8cf51..ea78f555b 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -18,6 +18,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/context"
)
@@ -47,6 +48,9 @@ func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) {
fs.refs = 1
fs.vfs = vfsObj
fs.impl = impl
+ vfsObj.filesystemsMu.Lock()
+ vfsObj.filesystems[fs] = struct{}{}
+ vfsObj.filesystemsMu.Unlock()
}
// VirtualFilesystem returns the containing VirtualFilesystem.
@@ -66,9 +70,28 @@ func (fs *Filesystem) IncRef() {
}
}
+// TryIncRef increments fs' reference count and returns true. If fs' reference
+// count is zero, TryIncRef does nothing and returns false.
+//
+// TryIncRef does not require that a reference is held on fs.
+func (fs *Filesystem) TryIncRef() bool {
+ for {
+ refs := atomic.LoadInt64(&fs.refs)
+ if refs <= 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) {
+ return true
+ }
+ }
+}
+
// DecRef decrements fs' reference count.
func (fs *Filesystem) DecRef() {
if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
+ fs.vfs.filesystemsMu.Lock()
+ delete(fs.vfs.filesystems, fs)
+ fs.vfs.filesystemsMu.Unlock()
fs.impl.Release()
} else if refs < 0 {
panic("Filesystem.decRef() called without holding a reference")
@@ -85,6 +108,24 @@ func (fs *Filesystem) DecRef() {
// (responsible for actually implementing the operation) isn't known until path
// resolution is complete.
//
+// Unless otherwise specified, FilesystemImpl methods are responsible for
+// performing permission checks. In many cases, vfs package functions in
+// permissions.go may be used to help perform these checks.
+//
+// When multiple specified error conditions apply to a given method call, the
+// implementation may return any applicable errno unless otherwise specified,
+// but returning the earliest error specified is preferable to maximize
+// compatibility with Linux.
+//
+// All methods may return errors not specified, notably including:
+//
+// - ENOENT if a required path component does not exist.
+//
+// - ENOTDIR if an intermediate path component is not a directory.
+//
+// - Errors from vfs-package functions (ResolvingPath.Resolve*(),
+// Mount.CheckBeginWrite(), permission-checking functions, etc.)
+//
// For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid
// should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID
// and auth.KGID respectively).
@@ -107,46 +148,223 @@ type FilesystemImpl interface {
// GetDentryAt does not correspond directly to a Linux syscall; it is used
// in the implementation of:
//
- // - Syscalls that need to resolve two paths: rename(), renameat(),
- // renameat2(), link(), linkat().
+ // - Syscalls that need to resolve two paths: link(), linkat().
//
// - Syscalls that need to refer to a filesystem position outside the
// context of a file description: chdir(), fchdir(), chroot(), mount(),
// umount().
GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error)
+ // GetParentDentryAt returns a Dentry representing the directory at the
+ // second-to-last path component in rp. (Note that, despite the name, this
+ // is not necessarily the parent directory of the file at rp, since the
+ // last path component in rp may be "." or "..".) A reference is taken on
+ // the returned Dentry.
+ //
+ // GetParentDentryAt does not correspond directly to a Linux syscall; it is
+ // used in the implementation of the rename() family of syscalls, which
+ // must resolve the parent directories of two paths.
+ //
+ // Preconditions: !rp.Done().
+ //
+ // Postconditions: If GetParentDentryAt returns a nil error, then
+ // rp.Final(). If GetParentDentryAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
+ GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error)
+
// LinkAt creates a hard link at rp representing the same file as vd. It
// does not take ownership of references on vd.
//
- // The implementation is responsible for checking that vd.Mount() ==
- // rp.Mount(), and that vd does not represent a directory.
+ // Errors:
+ //
+ // - If the last path component in rp is "." or "..", LinkAt returns
+ // EEXIST.
+ //
+ // - If a file already exists at rp, LinkAt returns EEXIST.
+ //
+ // - If rp.MustBeDir(), LinkAt returns ENOENT.
+ //
+ // - If the directory in which the link would be created has been removed
+ // by RmdirAt or RenameAt, LinkAt returns ENOENT.
+ //
+ // - If rp.Mount != vd.Mount(), LinkAt returns EXDEV.
+ //
+ // - If vd represents a directory, LinkAt returns EPERM.
+ //
+ // - If vd represents a file for which all existing links have been
+ // removed, or a file created by open(O_TMPFILE|O_EXCL), LinkAt returns
+ // ENOENT. Equivalently, if vd represents a file with a link count of 0 that
+ // was not created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink().
+ //
+ // Postconditions: If LinkAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error
// MkdirAt creates a directory at rp.
+ //
+ // Errors:
+ //
+ // - If the last path component in rp is "." or "..", MkdirAt returns
+ // EEXIST.
+ //
+ // - If a file already exists at rp, MkdirAt returns EEXIST.
+ //
+ // - If the directory in which the new directory would be created has been
+ // removed by RmdirAt or RenameAt, MkdirAt returns ENOENT.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink().
+ //
+ // Postconditions: If MkdirAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error
// MknodAt creates a regular file, device special file, or named pipe at
// rp.
+ //
+ // Errors:
+ //
+ // - If the last path component in rp is "." or "..", MknodAt returns
+ // EEXIST.
+ //
+ // - If a file already exists at rp, MknodAt returns EEXIST.
+ //
+ // - If rp.MustBeDir(), MknodAt returns ENOENT.
+ //
+ // - If the directory in which the file would be created has been removed
+ // by RmdirAt or RenameAt, MknodAt returns ENOENT.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink().
+ //
+ // Postconditions: If MknodAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error
// OpenAt returns an FileDescription providing access to the file at rp. A
// reference is taken on the returned FileDescription.
+ //
+ // Errors:
+ //
+ // - If opts.Flags specifies O_TMPFILE and this feature is unsupported by
+ // the implementation, OpenAt returns EOPNOTSUPP. (All other unsupported
+ // features are silently ignored, consistently with Linux's open*(2).)
OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error)
// ReadlinkAt returns the target of the symbolic link at rp.
+ //
+ // Errors:
+ //
+ // - If the file at rp is not a symbolic link, ReadlinkAt returns EINVAL.
ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error)
- // RenameAt renames the Dentry represented by vd to rp. It does not take
- // ownership of references on vd.
+ // RenameAt renames the file named oldName in directory oldParentVD to rp.
+ // It does not take ownership of references on oldParentVD.
+ //
+ // Errors [1]:
+ //
+ // - If opts.Flags specifies unsupported options, RenameAt returns EINVAL.
+ //
+ // - If the last path component in rp is "." or "..", and opts.Flags
+ // contains RENAME_NOREPLACE, RenameAt returns EEXIST.
+ //
+ // - If the last path component in rp is "." or "..", and opts.Flags does
+ // not contain RENAME_NOREPLACE, RenameAt returns EBUSY.
+ //
+ // - If rp.Mount != oldParentVD.Mount(), RenameAt returns EXDEV.
+ //
+ // - If the renamed file is not a directory, and opts.MustBeDir is true,
+ // RenameAt returns ENOTDIR.
+ //
+ // - If renaming would replace an existing file and opts.Flags contains
+ // RENAME_NOREPLACE, RenameAt returns EEXIST.
+ //
+ // - If there is no existing file at rp and opts.Flags contains
+ // RENAME_EXCHANGE, RenameAt returns ENOENT.
+ //
+ // - If there is an existing non-directory file at rp, and rp.MustBeDir()
+ // is true, RenameAt returns ENOTDIR.
+ //
+ // - If the renamed file is not a directory, opts.Flags does not contain
+ // RENAME_EXCHANGE, and rp.MustBeDir() is true, RenameAt returns ENOTDIR.
+ // (This check is not subsumed by the check for directory replacement below
+ // since it applies even if there is no file to replace.)
+ //
+ // - If the renamed file is a directory, and the new parent directory of
+ // the renamed file is either the renamed directory or a descendant
+ // subdirectory of the renamed directory, RenameAt returns EINVAL.
+ //
+ // - If renaming would exchange the renamed file with an ancestor directory
+ // of the renamed file, RenameAt returns EINVAL.
+ //
+ // - If renaming would replace an ancestor directory of the renamed file,
+ // RenameAt returns ENOTEMPTY. (This check would be subsumed by the
+ // non-empty directory check below; however, this check takes place before
+ // the self-rename check.)
+ //
+ // - If the renamed file would replace or exchange with itself (i.e. the
+ // source and destination paths resolve to the same file), RenameAt returns
+ // nil, skipping the checks described below.
+ //
+ // - If the source or destination directory is not writable by the provider
+ // of rp.Credentials(), RenameAt returns EACCES.
+ //
+ // - If the renamed file is a directory, and renaming would replace a
+ // non-directory file, RenameAt returns ENOTDIR.
//
- // The implementation is responsible for checking that vd.Mount() ==
- // rp.Mount().
- RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error
+ // - If the renamed file is not a directory, and renaming would replace a
+ // directory, RenameAt returns EISDIR.
+ //
+ // - If the new parent directory of the renamed file has been removed by
+ // RmdirAt or a preceding call to RenameAt, RenameAt returns ENOENT.
+ //
+ // - If the renamed file is a directory, it is not writable by the
+ // provider of rp.Credentials(), and the source and destination parent
+ // directories are different, RenameAt returns EACCES. (This is nominally
+ // required to change the ".." entry in the renamed directory.)
+ //
+ // - If renaming would replace a non-empty directory, RenameAt returns
+ // ENOTEMPTY.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink(). oldName is not "." or "..".
+ //
+ // Postconditions: If RenameAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
+ //
+ // [1] "The worst of all namespace operations - renaming directory.
+ // "Perverted" doesn't even start to describe it. Somebody in UCB had a
+ // heck of a trip..." - fs/namei.c:vfs_rename()
+ RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error
// RmdirAt removes the directory at rp.
+ //
+ // Errors:
+ //
+ // - If the last path component in rp is ".", RmdirAt returns EINVAL.
+ //
+ // - If the last path component in rp is "..", RmdirAt returns ENOTEMPTY.
+ //
+ // - If no file exists at rp, RmdirAt returns ENOENT.
+ //
+ // - If the file at rp exists but is not a directory, RmdirAt returns
+ // ENOTDIR.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink().
+ //
+ // Postconditions: If RmdirAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
RmdirAt(ctx context.Context, rp *ResolvingPath) error
// SetStatAt updates metadata for the file at the given path.
+ //
+ // Errors:
+ //
+ // - If opts specifies unsupported options, SetStatAt returns EINVAL.
SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error
// StatAt returns metadata for the file at rp.
@@ -158,10 +376,132 @@ type FilesystemImpl interface {
StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error)
// SymlinkAt creates a symbolic link at rp referring to the given target.
+ //
+ // Errors:
+ //
+ // - If the last path component in rp is "." or "..", SymlinkAt returns
+ // EEXIST.
+ //
+ // - If a file already exists at rp, SymlinkAt returns EEXIST.
+ //
+ // - If rp.MustBeDir(), SymlinkAt returns ENOENT.
+ //
+ // - If the directory in which the symbolic link would be created has been
+ // removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink().
+ //
+ // Postconditions: If SymlinkAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error
- // UnlinkAt removes the non-directory file at rp.
+ // UnlinkAt removes the file at rp.
+ //
+ // Errors:
+ //
+ // - If the last path component in rp is "." or "..", UnlinkAt returns
+ // EISDIR.
+ //
+ // - If no file exists at rp, UnlinkAt returns ENOENT.
+ //
+ // - If rp.MustBeDir(), and the file at rp exists and is not a directory,
+ // UnlinkAt returns ENOTDIR.
+ //
+ // - If the file at rp exists but is a directory, UnlinkAt returns EISDIR.
+ //
+ // Preconditions: !rp.Done(). For the final path component in rp,
+ // !rp.ShouldFollowSymlink().
+ //
+ // Postconditions: If UnlinkAt returns an error returned by
+ // ResolvingPath.Resolve*(), then !rp.Done().
UnlinkAt(ctx context.Context, rp *ResolvingPath) error
- // TODO: d_path(); extended attributes; inotify_add_watch(); bind()
+ // ListxattrAt returns all extended attribute names for the file at rp.
+ //
+ // Errors:
+ //
+ // - If extended attributes are not supported by the filesystem,
+ // ListxattrAt returns nil. (See FileDescription.Listxattr for an
+ // explanation.)
+ ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
+
+ // GetxattrAt returns the value associated with the given extended
+ // attribute for the file at rp.
+ //
+ // Errors:
+ //
+ // - If extended attributes are not supported by the filesystem, GetxattrAt
+ // returns ENOTSUP.
+ GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
+
+ // SetxattrAt changes the value associated with the given extended
+ // attribute for the file at rp.
+ //
+ // Errors:
+ //
+ // - If extended attributes are not supported by the filesystem, SetxattrAt
+ // returns ENOTSUP.
+ SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
+
+ // RemovexattrAt removes the given extended attribute from the file at rp.
+ //
+ // Errors:
+ //
+ // - If extended attributes are not supported by the filesystem,
+ // RemovexattrAt returns ENOTSUP.
+ RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
+
+ // PrependPath prepends a path from vd to vd.Mount().Root() to b.
+ //
+ // If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
+ // before vd.Mount().Root(), PrependPath should stop prepending path
+ // components and return a PrependPathAtVFSRootError.
+ //
+ // If traversal of vd.Dentry()'s ancestors encounters an independent
+ // ("root") Dentry that is not vd.Mount().Root() (i.e. vd.Dentry() is not a
+ // descendant of vd.Mount().Root()), PrependPath should stop prepending
+ // path components and return a PrependPathAtNonMountRootError.
+ //
+ // Filesystems for which Dentries do not have meaningful paths may prepend
+ // an arbitrary descriptive string to b and then return a
+ // PrependPathSyntheticError.
+ //
+ // Most implementations can acquire the appropriate locks to ensure that
+ // Dentry.Name() and Dentry.Parent() are fixed for vd.Dentry() and all of
+ // its ancestors, then call GenericPrependPath.
+ //
+ // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
+ PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
+
+ // TODO: inotify_add_watch(); bind()
+}
+
+// PrependPathAtVFSRootError is returned by implementations of
+// FilesystemImpl.PrependPath() when they encounter the contextual VFS root.
+type PrependPathAtVFSRootError struct{}
+
+// Error implements error.Error.
+func (PrependPathAtVFSRootError) Error() string {
+ return "vfs.FilesystemImpl.PrependPath() reached VFS root"
+}
+
+// PrependPathAtNonMountRootError is returned by implementations of
+// FilesystemImpl.PrependPath() when they encounter an independent ancestor
+// Dentry that is not the Mount root.
+type PrependPathAtNonMountRootError struct{}
+
+// Error implements error.Error.
+func (PrependPathAtNonMountRootError) Error() string {
+ return "vfs.FilesystemImpl.PrependPath() reached root other than Mount root"
+}
+
+// PrependPathSyntheticError is returned by implementations of
+// FilesystemImpl.PrependPath() for which prepended names do not represent real
+// paths.
+type PrependPathSyntheticError struct{}
+
+// Error implements error.Error.
+func (PrependPathSyntheticError) Error() string {
+ return "vfs.FilesystemImpl.PrependPath() prepended synthetic name"
}
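The RenameAt contract above requires EINVAL when a directory would become a descendant of itself, which is exactly the case the new Dentry.IsAncestorOf helper covers. A hedged sketch of how a FilesystemImpl might apply it (checkRenameIntoOwnSubtree is a hypothetical helper):

// Hypothetical check from a FilesystemImpl.RenameAt implementation: reject
// moving a directory into its own subtree, per the EINVAL case in the
// RenameAt contract.
func checkRenameIntoOwnSubtree(renamed, newParent *Dentry) error {
	if renamed == newParent || renamed.IsAncestorOf(newParent) {
		return syserror.EINVAL
	}
	return nil
}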
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
index 465e610e0..7315a588e 100644
--- a/pkg/sentry/vfs/filesystem_impl_util.go
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -16,6 +16,8 @@ package vfs
import (
"strings"
+
+ "gvisor.dev/gvisor/pkg/fspath"
)
// GenericParseMountOptions parses a comma-separated list of options of the
@@ -41,3 +43,27 @@ func GenericParseMountOptions(str string) map[string]string {
}
return m
}
+
+// GenericPrependPath may be used by implementations of
+// FilesystemImpl.PrependPath() for which a single statically-determined lock
+// or set of locks is sufficient to ensure its preconditions (as opposed to
+// e.g. per-Dentry locks).
+//
+// Preconditions: Dentry.Name() and Dentry.Parent() must be held constant for
+// vd.Dentry() and all of its ancestors.
+func GenericPrependPath(vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+ mnt, d := vd.mount, vd.dentry
+ for {
+ if mnt == vfsroot.mount && d == vfsroot.dentry {
+ return PrependPathAtVFSRootError{}
+ }
+ if d == mnt.root {
+ return nil
+ }
+ if d.parent == nil {
+ return PrependPathAtNonMountRootError{}
+ }
+ b.PrependComponent(d.name)
+ d = d.parent
+ }
+}
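A sketch of how a filesystem might satisfy the new PrependPath requirement with this helper; exampleFilesystem and its renameMu are assumptions standing in for whatever lock keeps Dentry names and parents stable, and package qualifiers are omitted as if written inside package vfs.

// Hypothetical implementation of FilesystemImpl.PrependPath. The lock must
// hold Dentry.Name() and Dentry.Parent() constant for vd.Dentry() and its
// ancestors, as GenericPrependPath requires.
type exampleFilesystem struct {
	vfsfs    Filesystem
	renameMu sync.RWMutex
}

func (fs *exampleFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
	fs.renameMu.RLock()
	defer fs.renameMu.RUnlock()
	return GenericPrependPath(vfsroot, vd, b)
}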
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index c335e206d..023301780 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -15,6 +15,7 @@
package vfs
import (
+ "bytes"
"fmt"
"gvisor.dev/gvisor/pkg/sentry/context"
@@ -43,28 +44,70 @@ type GetFilesystemOptions struct {
InternalData interface{}
}
+type registeredFilesystemType struct {
+ fsType FilesystemType
+ opts RegisterFilesystemTypeOptions
+}
+
+// RegisterFilesystemTypeOptions contains options to
+// VirtualFilesystem.RegisterFilesystemType().
+type RegisterFilesystemTypeOptions struct {
+ // If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt()
+ // for which MountOptions.InternalMount == false to use this filesystem
+ // type.
+ AllowUserMount bool
+
+ // If AllowUserList is true, make this filesystem type visible in
+ // /proc/filesystems.
+ AllowUserList bool
+
+ // If RequiresDevice is true, indicate that mounting this filesystem
+ // requires a block device as the mount source in /proc/filesystems.
+ RequiresDevice bool
+}
+
// RegisterFilesystemType registers the given FilesystemType in vfs with the
// given name.
-func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error {
+func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) error {
vfs.fsTypesMu.Lock()
defer vfs.fsTypesMu.Unlock()
if existing, ok := vfs.fsTypes[name]; ok {
- return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing)
+ return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing.fsType)
+ }
+ vfs.fsTypes[name] = &registeredFilesystemType{
+ fsType: fsType,
+ opts: *opts,
}
- vfs.fsTypes[name] = fsType
return nil
}
// MustRegisterFilesystemType is equivalent to RegisterFilesystemType but
// panics on failure.
-func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) {
- if err := vfs.RegisterFilesystemType(name, fsType); err != nil {
+func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) {
+ if err := vfs.RegisterFilesystemType(name, fsType, opts); err != nil {
panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err))
}
}
-func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType {
+func (vfs *VirtualFilesystem) getFilesystemType(name string) *registeredFilesystemType {
vfs.fsTypesMu.RLock()
defer vfs.fsTypesMu.RUnlock()
return vfs.fsTypes[name]
}
+
+// GenerateProcFilesystems emits the contents of /proc/filesystems for vfs to
+// buf.
+func (vfs *VirtualFilesystem) GenerateProcFilesystems(buf *bytes.Buffer) {
+ vfs.fsTypesMu.RLock()
+ defer vfs.fsTypesMu.RUnlock()
+ for name, rft := range vfs.fsTypes {
+ if !rft.opts.AllowUserList {
+ continue
+ }
+ var nodev string
+ if !rft.opts.RequiresDevice {
+ nodev = "nodev"
+ }
+ fmt.Fprintf(buf, "%s\t%s\n", nodev, name)
+ }
+}
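A sketch of registration under the new options; "examplefs" and exampleFSType are placeholders for a real FilesystemType. Only types registered with AllowUserList appear in GenerateProcFilesystems output, and only those with AllowUserMount may be used by MountAt without InternalMount.

// Hypothetical FilesystemType and its registration during VFS setup.
type exampleFSType struct{}

// GetFilesystem implements FilesystemType.GetFilesystem (stubbed out here).
func (exampleFSType) GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) {
	return nil, nil, syserror.ENOSYS
}

func registerExampleFS(vfsObj *VirtualFilesystem) {
	vfsObj.MustRegisterFilesystemType("examplefs", exampleFSType{}, &RegisterFilesystemTypeOptions{
		AllowUserMount: true,
		AllowUserList:  true,
	})
}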
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 1c3b2e987..00177b371 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -18,6 +18,7 @@ import (
"math"
"sync/atomic"
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
@@ -111,11 +112,11 @@ type MountNamespace struct {
// configured by the given arguments. A reference is taken on the returned
// MountNamespace.
func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
- fsType := vfs.getFilesystemType(fsTypeName)
- if fsType == nil {
+ rft := vfs.getFilesystemType(fsTypeName)
+ if rft == nil {
return nil, syserror.ENODEV
}
- fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+ fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
if err != nil {
return nil, err
}
@@ -133,13 +134,16 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
return mntns, nil
}
-// NewMount creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *GetFilesystemOptions) error {
- fsType := vfs.getFilesystemType(fsTypeName)
- if fsType == nil {
+// MountAt creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
+ rft := vfs.getFilesystemType(fsTypeName)
+ if rft == nil {
return syserror.ENODEV
}
- fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+ if !opts.InternalMount && !rft.opts.AllowUserMount {
+ return syserror.ENODEV
+ }
+ fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
if err != nil {
return err
}
@@ -207,6 +211,68 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
return nil
}
+// UmountAt removes the Mount at the given path.
+func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
+ if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
+ return syserror.EINVAL
+ }
+
+ // MNT_FORCE is currently unimplemented except for the permission check.
+ if opts.Flags&linux.MNT_FORCE != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
+ return syserror.EPERM
+ }
+
+ vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
+ if err != nil {
+ return err
+ }
+ defer vd.DecRef()
+ if vd.dentry != vd.mount.root {
+ return syserror.EINVAL
+ }
+ vfs.mountMu.Lock()
+ if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns {
+ vfs.mountMu.Unlock()
+ return syserror.EINVAL
+ }
+
+ // TODO(jamieliu): Linux special-cases umount of the caller's root, which
+ // we don't implement yet (we'll just fail it since the caller holds a
+ // reference on it).
+
+ vfs.mounts.seq.BeginWrite()
+ if opts.Flags&linux.MNT_DETACH == 0 {
+ if len(vd.mount.children) != 0 {
+ vfs.mounts.seq.EndWrite()
+ vfs.mountMu.Unlock()
+ return syserror.EBUSY
+ }
+ // We are holding a reference on vd.mount.
+ expectedRefs := int64(1)
+ if !vd.mount.umounted {
+ expectedRefs = 2
+ }
+ if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB
+ vfs.mounts.seq.EndWrite()
+ vfs.mountMu.Unlock()
+ return syserror.EBUSY
+ }
+ }
+ vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{
+ eager: opts.Flags&linux.MNT_DETACH == 0,
+ disconnectHierarchy: true,
+ }, nil, nil)
+ vfs.mounts.seq.EndWrite()
+ vfs.mountMu.Unlock()
+ for _, vd := range vdsToDecRef {
+ vd.DecRef()
+ }
+ for _, mnt := range mountsToDecRef {
+ mnt.DecRef()
+ }
+ return nil
+}
+
type umountRecursiveOptions struct {
// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
// on umounted mounts fail.
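A sketch of the renamed mount entry point and its new unmount counterpart; the target PathOperation, credentials, and the "examplefs" type are assumed to be supplied by the caller.

// Hypothetical callers of MountAt and UmountAt. MountOptions wraps the old
// GetFilesystemOptions and adds InternalMount; UmountOptions carries
// umount2(2) flags such as MNT_DETACH.
func mountExample(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, target *PathOperation) error {
	return vfsObj.MountAt(ctx, creds, "none" /* source */, target, "examplefs", &MountOptions{
		InternalMount: true, // bypass the AllowUserMount check
	})
}

func lazyUnmount(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, target *PathOperation) error {
	// MNT_DETACH detaches the subtree lazily, skipping the busy checks above.
	return vfsObj.UmountAt(ctx, creds, target, &UmountOptions{Flags: linux.MNT_DETACH})
}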
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index adff0b94b..3b933468d 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -17,8 +17,9 @@ package vfs
import (
"fmt"
"runtime"
- "sync"
"testing"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
func TestMountTableLookupEmpty(t *testing.T) {
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index ab13fa461..bd90d36c4 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -26,7 +26,7 @@ import (
"sync/atomic"
"unsafe"
- "gvisor.dev/gvisor/pkg/syncutil"
+ "gvisor.dev/gvisor/pkg/sync"
)
// mountKey represents the location at which a Mount is mounted. It is
@@ -75,7 +75,7 @@ type mountTable struct {
// intrinsics and inline assembly, limiting the performance of this
// approach.)
- seq syncutil.SeqCount
+ seq sync.SeqCount
seed uint32 // for hashing keys
// size holds both length (number of elements) and capacity (number of
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 3aa73d911..b7774bf28 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -46,6 +46,16 @@ type MknodOptions struct {
DevMinor uint32
}
+// MountOptions contains options to VirtualFilesystem.MountAt().
+type MountOptions struct {
+ // GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
+ GetFilesystemOptions GetFilesystemOptions
+
+ // If InternalMount is true, allow the use of filesystem types for which
+ // RegisterFilesystemTypeOptions.AllowUserMount == false.
+ InternalMount bool
+}
+
// OpenOptions contains options to VirtualFilesystem.OpenAt() and
// FilesystemImpl.OpenAt().
type OpenOptions struct {
@@ -77,6 +87,9 @@ type ReadOptions struct {
type RenameOptions struct {
// Flags contains flags as specified for renameat2(2).
Flags uint32
+
+ // If MustBeDir is true, the renamed file must be a directory.
+ MustBeDir bool
}
// SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
@@ -95,6 +108,20 @@ type SetStatOptions struct {
Stat linux.Statx
}
+// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
+// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
+// FileDescriptionImpl.Setxattr().
+type SetxattrOptions struct {
+ // Name is the name of the extended attribute being mutated.
+ Name string
+
+ // Value is the extended attribute's new value.
+ Value string
+
+ // Flags contains flags as specified for setxattr/lsetxattr/fsetxattr(2).
+ Flags uint32
+}
+
// StatOptions contains options to VirtualFilesystem.StatAt(),
// FilesystemImpl.StatAt(), FileDescription.Stat(), and
// FileDescriptionImpl.Stat().
@@ -114,6 +141,12 @@ type StatOptions struct {
Sync uint32
}
+// UmountOptions contains options to VirtualFilesystem.UmountAt().
+type UmountOptions struct {
+ // Flags contains flags as specified for umount2(2).
+ Flags uint32
+}
+
// WriteOptions contains options to FileDescription.PWrite(),
// FileDescriptionImpl.PWrite(), FileDescription.Write(), and
// FileDescriptionImpl.Write().
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
new file mode 100644
index 000000000..cf80df90e
--- /dev/null
+++ b/pkg/sentry/vfs/pathname.go
@@ -0,0 +1,152 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+var fspathBuilderPool = sync.Pool{
+ New: func() interface{} {
+ return &fspath.Builder{}
+ },
+}
+
+func getFSPathBuilder() *fspath.Builder {
+ return fspathBuilderPool.Get().(*fspath.Builder)
+}
+
+func putFSPathBuilder(b *fspath.Builder) {
+ // No methods can be called on b after b.String(), so reset it to its zero
+ // value (as returned by fspathBuilderPool.New) instead.
+ *b = fspath.Builder{}
+ fspathBuilderPool.Put(b)
+}
+
+// PathnameWithDeleted returns an absolute pathname to vd, consistent with
+// Linux's d_path(). In particular, if vd.Dentry() has been disowned,
+// PathnameWithDeleted appends " (deleted)" to the returned pathname.
+func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+ b := getFSPathBuilder()
+ defer putFSPathBuilder(b)
+ haveRef := false
+ defer func() {
+ if haveRef {
+ vd.DecRef()
+ }
+ }()
+
+ origD := vd.dentry
+loop:
+ for {
+ err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+ switch err.(type) {
+ case nil:
+ if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+ // GenericPrependPath() will have returned
+ // PrependPathAtVFSRootError in this case since it checks
+ // against vfsroot before mnt.root, but other implementations
+ // of FilesystemImpl.PrependPath() may return nil instead.
+ break loop
+ }
+ nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+ if !nextVD.Ok() {
+ break loop
+ }
+ if haveRef {
+ vd.DecRef()
+ }
+ vd = nextVD
+ haveRef = true
+ // continue loop
+ case PrependPathSyntheticError:
+ // Skip prepending "/" and appending " (deleted)".
+ return b.String(), nil
+ case PrependPathAtVFSRootError, PrependPathAtNonMountRootError:
+ break loop
+ default:
+ return "", err
+ }
+ }
+ b.PrependByte('/')
+ if origD.IsDisowned() {
+ b.AppendString(" (deleted)")
+ }
+ return b.String(), nil
+}
+
+// PathnameForGetcwd returns an absolute pathname to vd, consistent with
+// Linux's sys_getcwd().
+func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+ if vd.dentry.IsDisowned() {
+ return "", syserror.ENOENT
+ }
+
+ b := getFSPathBuilder()
+ defer putFSPathBuilder(b)
+ haveRef := false
+ defer func() {
+ if haveRef {
+ vd.DecRef()
+ }
+ }()
+ unreachable := false
+loop:
+ for {
+ err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+ switch err.(type) {
+ case nil:
+ if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+ break loop
+ }
+ nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+ if !nextVD.Ok() {
+ unreachable = true
+ break loop
+ }
+ if haveRef {
+ vd.DecRef()
+ }
+ vd = nextVD
+ haveRef = true
+ case PrependPathAtVFSRootError:
+ break loop
+ case PrependPathAtNonMountRootError, PrependPathSyntheticError:
+ unreachable = true
+ break loop
+ default:
+ return "", err
+ }
+ }
+ b.PrependByte('/')
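+ // Descriptive note: Linux's sys_getcwd() prepends "(unreachable)" when the
+ // working directory cannot be reached from the caller's root, which is what
+ // the unreachable flag tracks above.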
+ if unreachable {
+ b.PrependString("(unreachable)")
+ }
+ return b.String(), nil
+}
+
+// As of this writing, we do not have equivalents to:
+//
+// - d_absolute_path(), which returns EINVAL if (effectively) any call to
+// FilesystemImpl.PrependPath() would return PrependPathAtNonMountRootError.
+//
+// - dentry_path(), which does not walk up mounts (and only returns the path
+// relative to Filesystem root), but also appends "//deleted" for disowned
+// Dentries.
+//
+// These should be added as necessary.
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index f8e74355c..f1edb0680 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -119,3 +119,65 @@ func MayWriteFileWithOpenFlags(flags uint32) bool {
return false
}
}
+
+// CheckSetStat checks that creds has permission to change the metadata of a
+// file with the given permissions, UID, and GID as specified by stat, subject
+// to the rules of Linux's fs/attr.c:setattr_prepare().
+func CheckSetStat(creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+ if stat.Mask&linux.STATX_MODE != 0 {
+ if !CanActAsOwner(creds, kuid) {
+ return syserror.EPERM
+ }
+ // TODO(b/30815691): "If the calling process is not privileged (Linux:
+ // does not have the CAP_FSETID capability), and the group of the file
+ // does not match the effective group ID of the process or one of its
+ // supplementary group IDs, the S_ISGID bit will be turned off, but
+ // this will not cause an error to be returned." - chmod(2)
+ }
+ if stat.Mask&linux.STATX_UID != 0 {
+ if !((creds.EffectiveKUID == kuid && auth.KUID(stat.UID) == kuid) ||
+ HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) {
+ return syserror.EPERM
+ }
+ }
+ if stat.Mask&linux.STATX_GID != 0 {
+ if !((creds.EffectiveKUID == kuid && creds.InGroup(auth.KGID(stat.GID))) ||
+ HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) {
+ return syserror.EPERM
+ }
+ }
+ if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 {
+ if !CanActAsOwner(creds, kuid) {
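+ // Non-owners may only set timestamps to the current time (UTIME_NOW),
+ // and even then only with write permission, checked below.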
+ if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) ||
+ (stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW) ||
+ (stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) {
+ return syserror.EPERM
+ }
+ // isDir is irrelevant in the following call to
+ // GenericCheckPermissions since ats == MayWrite means that
+ // CAP_DAC_READ_SEARCH does not apply, and CAP_DAC_OVERRIDE
+ // applies, regardless of isDir.
+ if err := GenericCheckPermissions(creds, MayWrite, false /* isDir */, mode, kuid, kgid); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
+// CanActAsOwner returns true if creds can act as the owner of a file with the
+// given owning UID, consistent with Linux's
+// fs/inode.c:inode_owner_or_capable().
+func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool {
+ if creds.EffectiveKUID == kuid {
+ return true
+ }
+ return creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(kuid).Ok()
+}
+
+// HasCapabilityOnFile returns true if creds has the given capability with
+// respect to a file with the given owning UID and GID, consistent with Linux's
+// kernel/capability.c:capable_wrt_inode_uidgid().
+func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool {
+ return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok()
+}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 621f5a6f8..8a0b382f6 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -16,11 +16,11 @@ package vfs
import (
"fmt"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -85,11 +85,11 @@ func init() {
// so error "constants" are really mutable vars, necessitating somewhat
// expensive interface object comparisons.
-type resolveMountRootError struct{}
+type resolveMountRootOrJumpError struct{}
// Error implements error.Error.
-func (resolveMountRootError) Error() string {
- return "resolving mount root"
+func (resolveMountRootOrJumpError) Error() string {
+ return "resolving mount root or jump"
}
type resolveMountPointError struct{}
@@ -112,30 +112,26 @@ var resolvingPathPool = sync.Pool{
},
}
-func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) {
- path, err := fspath.Parse(pop.Pathname)
- if err != nil {
- return nil, err
- }
+func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) *ResolvingPath {
rp := resolvingPathPool.Get().(*ResolvingPath)
rp.vfs = vfs
rp.root = pop.Root
rp.mount = pop.Start.mount
rp.start = pop.Start.dentry
- rp.pit = path.Begin
+ rp.pit = pop.Path.Begin
rp.flags = 0
if pop.FollowFinalSymlink {
rp.flags |= rpflagsFollowFinalSymlink
}
- rp.mustBeDir = path.Dir
- rp.mustBeDirOrig = path.Dir
+ rp.mustBeDir = pop.Path.Dir
+ rp.mustBeDirOrig = pop.Path.Dir
rp.symlinks = 0
rp.curPart = 0
rp.numOrigParts = 1
rp.creds = creds
- rp.parts[0] = path.Begin
- rp.origParts[0] = path.Begin
- return rp, nil
+ rp.parts[0] = pop.Path.Begin
+ rp.origParts[0] = pop.Path.Begin
+ return rp
}
func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
@@ -274,7 +270,7 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
// ... of non-root mount.
rp.nextMount = vd.mount
rp.nextStart = vd.dentry
- return nil, resolveMountRootError{}
+ return nil, resolveMountRootOrJumpError{}
}
// ... of root mount.
parent = d
@@ -345,29 +341,34 @@ func (rp *ResolvingPath) ShouldFollowSymlink() bool {
// symlink target and returns nil. Otherwise it returns a non-nil error.
//
// Preconditions: !rp.Done().
+//
+// Postconditions: If HandleSymlink returns a nil error, then !rp.Done().
func (rp *ResolvingPath) HandleSymlink(target string) error {
if rp.symlinks >= linux.MaxSymlinkTraversals {
return syserror.ELOOP
}
- targetPath, err := fspath.Parse(target)
- if err != nil {
- return err
+ if len(target) == 0 {
+ return syserror.ENOENT
}
rp.symlinks++
+ targetPath := fspath.Parse(target)
if targetPath.Absolute {
rp.absSymlinkTarget = targetPath
return resolveAbsSymlinkError{}
}
- if !targetPath.Begin.Ok() {
- panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target))
- }
// Consume the path component that represented the symlink.
rp.Advance()
// Prepend the symlink target to the relative path.
+ if checkInvariants {
+ if !targetPath.HasComponents() {
+ panic(fmt.Sprintf("non-empty pathname %q parsed to relative path with no components", target))
+ }
+ }
rp.relpathPrepend(targetPath)
return nil
}
+// Preconditions: path.HasComponents().
func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
if rp.pit.Ok() {
rp.parts[rp.curPart] = rp.pit
@@ -385,11 +386,32 @@ func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
}
}
+// HandleJump is called when the current path component is a "magic" link to
+// the given VirtualDentry, like /proc/[pid]/fd/[fd]. If the calling Filesystem
+// method should continue path traversal, HandleJump updates the path
+// component stream to reflect the magic link target and returns nil. Otherwise
+// it returns a non-nil error.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) HandleJump(target VirtualDentry) error {
+ if rp.symlinks >= linux.MaxSymlinkTraversals {
+ return syserror.ELOOP
+ }
+ rp.symlinks++
+ // Consume the path component that represented the magic link.
+ rp.Advance()
+ // Unconditionally return a resolveMountRootOrJumpError, even if the Mount
+ // isn't changing, to force restarting at the new Dentry.
+ target.IncRef()
+ rp.nextMount = target.mount
+ rp.nextStart = target.dentry
+ return resolveMountRootOrJumpError{}
+}
+
func (rp *ResolvingPath) handleError(err error) bool {
switch err.(type) {
- case resolveMountRootError:
- // Switch to the new Mount. We hold references on the Mount and Dentry
- // (from VFS.getMountpointAt()).
+ case resolveMountRootOrJumpError:
+ // Switch to the new Mount. We hold references on the Mount and Dentry.
rp.decRefStartAndMount()
rp.mount = rp.nextMount
rp.start = rp.nextStart
@@ -407,9 +429,8 @@ func (rp *ResolvingPath) handleError(err error) bool {
return true
case resolveMountPointError:
- // Switch to the new Mount. We hold a reference on the Mount (from
- // VFS.getMountAt()), but borrow the reference on the mount root from
- // the Mount.
+ // Switch to the new Mount. We hold a reference on the Mount, but
+ // borrow the reference on the mount root from the Mount.
rp.decRefStartAndMount()
rp.mount = rp.nextMount
rp.start = rp.nextMount.root
@@ -447,6 +468,17 @@ func (rp *ResolvingPath) handleError(err error) bool {
}
}
+// canHandleError returns true if err is an error returned by rp.Resolve*()
+// that rp.handleError() may attempt to handle.
+func (rp *ResolvingPath) canHandleError(err error) bool {
+ switch err.(type) {
+ case resolveMountRootOrJumpError, resolveMountPointError, resolveAbsSymlinkError:
+ return true
+ default:
+ return false
+ }
+}
+
// MustBeDir returns true if the file traversed by rp must be a directory.
func (rp *ResolvingPath) MustBeDir() bool {
return rp.mustBeDir
diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go
deleted file mode 100644
index 436151afa..000000000
--- a/pkg/sentry/vfs/syscalls.go
+++ /dev/null
@@ -1,237 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// PathOperation specifies the path operated on by a VFS method.
-//
-// PathOperation is passed to VFS methods by pointer to reduce memory copying:
-// it's somewhat large and should never escape. (Options structs are passed by
-// pointer to VFS and FileDescription methods for the same reason.)
-type PathOperation struct {
- // Root is the VFS root. References on Root are borrowed from the provider
- // of the PathOperation.
- //
- // Invariants: Root.Ok().
- Root VirtualDentry
-
- // Start is the starting point for the path traversal. References on Start
- // are borrowed from the provider of the PathOperation (i.e. the caller of
- // the VFS method to which the PathOperation was passed).
- //
- // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
- Start VirtualDentry
-
- // Path is the pathname traversed by this operation.
- Pathname string
-
- // If FollowFinalSymlink is true, and the Dentry traversed by the final
- // path component represents a symbolic link, the symbolic link should be
- // followed.
- FollowFinalSymlink bool
-}
-
-// GetDentryAt returns a VirtualDentry representing the given path, at which a
-// file must exist. A reference is taken on the returned VirtualDentry.
-func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return VirtualDentry{}, err
- }
- for {
- d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
- if err == nil {
- vd := VirtualDentry{
- mount: rp.mount,
- dentry: d,
- }
- rp.mount.IncRef()
- vfs.putResolvingPath(rp)
- return vd, nil
- }
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return VirtualDentry{}, err
- }
- }
-}
-
-// MkdirAt creates a directory at the given path.
-func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
- // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
- // also honored." - mkdir(2)
- opts.Mode &= 01777
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return err
- }
- for {
- err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
- if err == nil {
- vfs.putResolvingPath(rp)
- return nil
- }
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return err
- }
- }
-}
-
-// MknodAt creates a file of the given mode at the given path. It returns an
-// error from the syserror package.
-func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return nil
- }
- for {
- if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil {
- vfs.putResolvingPath(rp)
- return nil
- }
- // Handle mount traversals.
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return err
- }
- }
-}
-
-// OpenAt returns a FileDescription providing access to the file at the given
-// path. A reference is taken on the returned FileDescription.
-func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
- // Remove:
- //
- // - O_LARGEFILE, which we always report in FileDescription status flags
- // since only 64-bit architectures are supported at this time.
- //
- // - O_CLOEXEC, which affects file descriptors and therefore must be
- // handled outside of VFS.
- //
- // - Unknown flags.
- opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
- // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
- if opts.Flags&linux.O_SYNC != 0 {
- opts.Flags |= linux.O_DSYNC
- }
- // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
- // with O_DIRECTORY and a writable access mode (to ensure that it fails on
- // filesystem implementations that do not support it).
- if opts.Flags&linux.O_TMPFILE != 0 {
- if opts.Flags&linux.O_DIRECTORY == 0 {
- return nil, syserror.EINVAL
- }
- if opts.Flags&linux.O_CREAT != 0 {
- return nil, syserror.EINVAL
- }
- if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
- return nil, syserror.EINVAL
- }
- }
- // O_PATH causes most other flags to be ignored.
- if opts.Flags&linux.O_PATH != 0 {
- opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
- }
- // "On Linux, the following bits are also honored in mode: [S_ISUID,
- // S_ISGID, S_ISVTX]" - open(2)
- opts.Mode &= 07777
-
- if opts.Flags&linux.O_NOFOLLOW != 0 {
- pop.FollowFinalSymlink = false
- }
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return nil, err
- }
- if opts.Flags&linux.O_DIRECTORY != 0 {
- rp.mustBeDir = true
- rp.mustBeDirOrig = true
- }
- for {
- fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
- if err == nil {
- vfs.putResolvingPath(rp)
- return fd, nil
- }
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return nil, err
- }
- }
-}
-
-// StatAt returns metadata for the file at the given path.
-func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
- rp, err := vfs.getResolvingPath(creds, pop)
- if err != nil {
- return linux.Statx{}, err
- }
- for {
- stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
- if err == nil {
- vfs.putResolvingPath(rp)
- return stat, nil
- }
- if !rp.handleError(err) {
- vfs.putResolvingPath(rp)
- return linux.Statx{}, err
- }
- }
-}
-
-// StatusFlags returns file description status flags.
-func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
- flags, err := fd.impl.StatusFlags(ctx)
- flags |= linux.O_LARGEFILE
- return flags, err
-}
-
-// SetStatusFlags sets file description status flags.
-func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
- return fd.impl.SetStatusFlags(ctx, flags)
-}
-
-// TODO:
-//
-// - VFS.SyncAllFilesystems() for sync(2)
-//
-// - Something for syncfs(2)
-//
-// - VFS.LinkAt()
-//
-// - VFS.ReadlinkAt()
-//
-// - VFS.RenameAt()
-//
-// - VFS.RmdirAt()
-//
-// - VFS.SetStatAt()
-//
-// - VFS.StatFSAt()
-//
-// - VFS.SymlinkAt()
-//
-// - VFS.UmountAt()
-//
-// - VFS.UnlinkAt()
-//
-// - FileDescription.(almost everything)
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
index 593144cb7..ee5c8b9e2 100644
--- a/pkg/sentry/vfs/testutil.go
+++ b/pkg/sentry/vfs/testutil.go
@@ -15,7 +15,10 @@
package vfs
import (
+ "fmt"
+
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
@@ -54,6 +57,11 @@ func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath,
return nil, syserror.EPERM
}
+// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt.
+func (fs *FDTestFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) {
+ return nil, syserror.EPERM
+}
+
// LinkAt implements FilesystemImpl.LinkAt.
func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
return syserror.EPERM
@@ -80,7 +88,7 @@ func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (
}
// RenameAt implements FilesystemImpl.RenameAt.
-func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error {
+func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error {
return syserror.EPERM
}
@@ -114,6 +122,32 @@ func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) err
return syserror.EPERM
}
+// ListxattrAt implements FilesystemImpl.ListxattrAt.
+func (fs *FDTestFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+ return nil, syserror.EPERM
+}
+
+// GetxattrAt implements FilesystemImpl.GetxattrAt.
+func (fs *FDTestFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+ return "", syserror.EPERM
+}
+
+// SetxattrAt implements FilesystemImpl.SetxattrAt.
+func (fs *FDTestFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+ return syserror.EPERM
+}
+
+// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
+func (fs *FDTestFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+ return syserror.EPERM
+}
+
+// PrependPath implements FilesystemImpl.PrependPath.
+func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+ b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry)))
+ return PrependPathSyntheticError{}
+}
+
type fdTestDentry struct {
vfsd Dentry
}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index f0cd3ffe5..1f21b0b31 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -20,6 +20,7 @@
// VirtualFilesystem.mountMu
// Dentry.mu
// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
+// VirtualFilesystem.filesystemsMu
// VirtualFilesystem.fsTypesMu
//
// Locking Dentry.mu in multiple Dentries requires holding
@@ -27,7 +28,14 @@
package vfs
import (
- "sync"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
@@ -67,22 +75,598 @@ type VirtualFilesystem struct {
// mountpoints is analogous to Linux's mountpoint_hashtable.
mountpoints map[*Dentry]map[*Mount]struct{}
- // fsTypes contains all FilesystemTypes that are usable in the
- // VirtualFilesystem. fsTypes is protected by fsTypesMu.
+ // devices contains all registered Devices. devices is protected by
+ // devicesMu.
+ devicesMu sync.RWMutex
+ devices map[devTuple]*registeredDevice
+
+ // fsTypes contains all registered FilesystemTypes. fsTypes is protected by
+ // fsTypesMu.
fsTypesMu sync.RWMutex
- fsTypes map[string]FilesystemType
+ fsTypes map[string]*registeredFilesystemType
+
+ // filesystems contains all Filesystems. filesystems is protected by
+ // filesystemsMu.
+ filesystemsMu sync.Mutex
+ filesystems map[*Filesystem]struct{}
}
// New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
func New() *VirtualFilesystem {
vfs := &VirtualFilesystem{
mountpoints: make(map[*Dentry]map[*Mount]struct{}),
- fsTypes: make(map[string]FilesystemType),
+ devices: make(map[devTuple]*registeredDevice),
+ fsTypes: make(map[string]*registeredFilesystemType),
+ filesystems: make(map[*Filesystem]struct{}),
}
vfs.mounts.Init()
return vfs
}
+// PathOperation specifies the path operated on by a VFS method.
+//
+// PathOperation is passed to VFS methods by pointer to reduce memory copying:
+// it's somewhat large and should never escape. (Options structs are passed by
+// pointer to VFS and FileDescription methods for the same reason.)
+type PathOperation struct {
+ // Root is the VFS root. References on Root are borrowed from the provider
+ // of the PathOperation.
+ //
+ // Invariants: Root.Ok().
+ Root VirtualDentry
+
+ // Start is the starting point for the path traversal. References on Start
+ // are borrowed from the provider of the PathOperation (i.e. the caller of
+ // the VFS method to which the PathOperation was passed).
+ //
+ // Invariants: Start.Ok(). If Path.Absolute, then Start == Root.
+ Start VirtualDentry
+
+ // Path is the pathname traversed by this operation.
+ Path fspath.Path
+
+ // If FollowFinalSymlink is true, and the Dentry traversed by the final
+ // path component represents a symbolic link, the symbolic link should be
+ // followed.
+ FollowFinalSymlink bool
+}
+
+// GetDentryAt returns a VirtualDentry representing the given path, at which a
+// file must exist. A reference is taken on the returned VirtualDentry.
+func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
+ rp := vfs.getResolvingPath(creds, pop)
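+ // Loop, retrying after rp.handleError() resolves mount traversals and
+ // absolute symlinks; any other error terminates the walk.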
+ for {
+ d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
+ if err == nil {
+ vd := VirtualDentry{
+ mount: rp.mount,
+ dentry: d,
+ }
+ rp.mount.IncRef()
+ vfs.putResolvingPath(rp)
+ return vd, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return VirtualDentry{}, err
+ }
+ }
+}
+
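+// getParentDirAndName resolves the parent directory of the last path
+// component in pop and returns it along with that component's name. A
+// reference is taken on the returned VirtualDentry.
+//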
+// Preconditions: pop.Path.Begin.Ok().
+func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp)
+ if err == nil {
+ parentVD := VirtualDentry{
+ mount: rp.mount,
+ dentry: parent,
+ }
+ rp.mount.IncRef()
+ name := rp.Component()
+ vfs.putResolvingPath(rp)
+ return parentVD, name, nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return VirtualDentry{}, "", err
+ }
+ }
+}
+
+// LinkAt creates a hard link at newpop representing the existing file at
+// oldpop.
+func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error {
+ oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
+ if err != nil {
+ return err
+ }
+
+ if !newpop.Path.Begin.Ok() {
+ oldVD.DecRef()
+ if newpop.Path.Absolute {
+ return syserror.EEXIST
+ }
+ return syserror.ENOENT
+ }
+ if newpop.FollowFinalSymlink {
+ oldVD.DecRef()
+ ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ rp := vfs.getResolvingPath(creds, newpop)
+ for {
+ err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ oldVD.DecRef()
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ oldVD.DecRef()
+ return err
+ }
+ }
+}
+
+// MkdirAt creates a directory at the given path.
+func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
+ if !pop.Path.Begin.Ok() {
+ if pop.Path.Absolute {
+ return syserror.EEXIST
+ }
+ return syserror.ENOENT
+ }
+ if pop.FollowFinalSymlink {
+ ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink")
+ return syserror.EINVAL
+ }
+ // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
+ // also honored." - mkdir(2)
+ opts.Mode &= 0777 | linux.S_ISVTX
+
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// MknodAt creates a file of the given mode at the given path. It returns an
+// error from the syserror package.
+func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
+ if !pop.Path.Begin.Ok() {
+ if pop.Path.Absolute {
+ return syserror.EEXIST
+ }
+ return syserror.ENOENT
+ }
+ if pop.FollowFinalSymlink {
+ ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// OpenAt returns a FileDescription providing access to the file at the given
+// path. A reference is taken on the returned FileDescription.
+func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
+ // Remove:
+ //
+ // - O_LARGEFILE, which we always report in FileDescription status flags
+ // since only 64-bit architectures are supported at this time.
+ //
+ // - O_CLOEXEC, which affects file descriptors and therefore must be
+ // handled outside of VFS.
+ //
+ // - Unknown flags.
+ opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
+ // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
+ if opts.Flags&linux.O_SYNC != 0 {
+ opts.Flags |= linux.O_DSYNC
+ }
+ // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
+ // with O_DIRECTORY and a writable access mode (to ensure that it fails on
+ // filesystem implementations that do not support it).
+ if opts.Flags&linux.O_TMPFILE != 0 {
+ if opts.Flags&linux.O_DIRECTORY == 0 {
+ return nil, syserror.EINVAL
+ }
+ if opts.Flags&linux.O_CREAT != 0 {
+ return nil, syserror.EINVAL
+ }
+ if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
+ return nil, syserror.EINVAL
+ }
+ }
+ // O_PATH causes most other flags to be ignored.
+ if opts.Flags&linux.O_PATH != 0 {
+ opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
+ }
+ // "On Linux, the following bits are also honored in mode: [S_ISUID,
+ // S_ISGID, S_ISVTX]" - open(2)
+ opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
+
+ if opts.Flags&linux.O_NOFOLLOW != 0 {
+ pop.FollowFinalSymlink = false
+ }
+ rp := vfs.getResolvingPath(creds, pop)
+ if opts.Flags&linux.O_DIRECTORY != 0 {
+ rp.mustBeDir = true
+ rp.mustBeDirOrig = true
+ }
+ for {
+ fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return fd, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return nil, err
+ }
+ }
+}
+
+// ReadlinkAt returns the target of the symbolic link at the given path.
+func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return target, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return "", err
+ }
+ }
+}
+
+// RenameAt renames the file at oldpop to newpop.
+func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error {
+ if !oldpop.Path.Begin.Ok() {
+ if oldpop.Path.Absolute {
+ return syserror.EBUSY
+ }
+ return syserror.ENOENT
+ }
+ if oldpop.FollowFinalSymlink {
+ ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop)
+ if err != nil {
+ return err
+ }
+ if oldName == "." || oldName == ".." {
+ oldParentVD.DecRef()
+ return syserror.EBUSY
+ }
+
+ if !newpop.Path.Begin.Ok() {
+ oldParentVD.DecRef()
+ if newpop.Path.Absolute {
+ return syserror.EBUSY
+ }
+ return syserror.ENOENT
+ }
+ if newpop.FollowFinalSymlink {
+ oldParentVD.DecRef()
+ ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ rp := vfs.getResolvingPath(creds, newpop)
+ renameOpts := *opts
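+ // A trailing slash on the source path requires the renamed file to be a
+ // directory.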
+ if oldpop.Path.Dir {
+ renameOpts.MustBeDir = true
+ }
+ for {
+ err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ oldParentVD.DecRef()
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ oldParentVD.DecRef()
+ return err
+ }
+ }
+}
+
+// RmdirAt removes the directory at the given path.
+func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
+ if !pop.Path.Begin.Ok() {
+ if pop.Path.Absolute {
+ return syserror.EBUSY
+ }
+ return syserror.ENOENT
+ }
+ if pop.FollowFinalSymlink {
+ ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.RmdirAt(ctx, rp)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// SetStatAt changes metadata for the file at the given path.
+func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// StatAt returns metadata for the file at the given path.
+func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return stat, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return linux.Statx{}, err
+ }
+ }
+}
+
+// StatFSAt returns metadata for the filesystem containing the file at the
+// given path.
+func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return statfs, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return linux.Statfs{}, err
+ }
+ }
+}
+
+// SymlinkAt creates a symbolic link at the given path with the given target.
+func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
+ if !pop.Path.Begin.Ok() {
+ if pop.Path.Absolute {
+ return syserror.EEXIST
+ }
+ return syserror.ENOENT
+ }
+ if pop.FollowFinalSymlink {
+ ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// UnlinkAt deletes the non-directory file at the given path.
+func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
+ if !pop.Path.Begin.Ok() {
+ if pop.Path.Absolute {
+ return syserror.EBUSY
+ }
+ return syserror.ENOENT
+ }
+ if pop.FollowFinalSymlink {
+ ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink")
+ return syserror.EINVAL
+ }
+
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if checkInvariants {
+ if rp.canHandleError(err) && rp.Done() {
+ panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+ }
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// ListxattrAt returns all extended attribute names for the file at the given
+// path.
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return names, nil
+ }
+ if err == syserror.ENOTSUP {
+ // Linux doesn't actually return ENOTSUP in this case; instead,
+ // fs/xattr.c:vfs_listxattr() falls back to allowing the security
+ // subsystem to return security extended attributes, which by
+ // default don't exist.
+ vfs.putResolvingPath(rp)
+ return nil, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return nil, err
+ }
+ }
+}
+
+// GetxattrAt returns the value associated with the given extended attribute
+// for the file at the given path.
+func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return val, nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return "", err
+ }
+ }
+}
+
+// SetxattrAt changes the value associated with the given extended attribute
+// for the file at the given path.
+func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// RemovexattrAt removes the given extended attribute from the file at the
+// given path.
+func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
+ rp := vfs.getResolvingPath(creds, pop)
+ for {
+ err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+ if err == nil {
+ vfs.putResolvingPath(rp)
+ return nil
+ }
+ if !rp.handleError(err) {
+ vfs.putResolvingPath(rp)
+ return err
+ }
+ }
+}
+
+// SyncAllFilesystems has the semantics of Linux's sync(2).
+func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
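+ // Snapshot all filesystems (taking a reference on each) under
+ // filesystemsMu, then sync them outside the lock and return the first
+ // error encountered.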
+ fss := make(map[*Filesystem]struct{})
+ vfs.filesystemsMu.Lock()
+ for fs := range vfs.filesystems {
+ if !fs.TryIncRef() {
+ continue
+ }
+ fss[fs] = struct{}{}
+ }
+ vfs.filesystemsMu.Unlock()
+ var retErr error
+ for fs := range fss {
+ if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
+ retErr = err
+ }
+ fs.DecRef()
+ }
+ return retErr
+}
+
// A VirtualDentry represents a node in a VFS tree, by combining a Dentry
// (which represents a node in a Filesystem's tree) and a Mount (which
// represents the Filesystem's position in a VFS mount tree).
diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD
index 4d8435265..28f21f13d 100644
--- a/pkg/sentry/watchdog/BUILD
+++ b/pkg/sentry/watchdog/BUILD
@@ -13,5 +13,6 @@ go_library(
"//pkg/metric",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/time",
+ "//pkg/sync",
],
)
diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go
index ecce6c69f..bfb2fac26 100644
--- a/pkg/sentry/watchdog/watchdog.go
+++ b/pkg/sentry/watchdog/watchdog.go
@@ -32,7 +32,6 @@ package watchdog
import (
"bytes"
"fmt"
- "sync"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -40,6 +39,7 @@ import (
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/sync"
)
// Opts configures the watchdog.
@@ -287,7 +287,9 @@ func (w *Watchdog) runTurn() {
if !ok {
// New stuck task detected.
//
- // TODO(b/65849403): Tasks blocked doing IO may be considered stuck in kernel.
+ // Note that tasks blocked doing IO may be considered stuck in kernel,
+ // unless they are surrounded by
+ // Task.UninterruptibleSleepStart/Finish.
tc = &offender{lastUpdateTime: lastUpdateTime}
stuckTasks.Increment()
newTaskFound = true
diff --git a/pkg/sleep/sleep_test.go b/pkg/sleep/sleep_test.go
index 130806c86..af47e2ba1 100644
--- a/pkg/sleep/sleep_test.go
+++ b/pkg/sleep/sleep_test.go
@@ -376,6 +376,37 @@ func TestRace(t *testing.T) {
}
}
+// TestRaceInOrder tests that multiple wakers can continuously send wake requests to
+// the sleeper and that the wakers are retrieved in the order asserted.
+func TestRaceInOrder(t *testing.T) {
+ const wakers = 100
+ const wakeRequests = 10000
+
+ w := make([]Waker, wakers)
+ s := Sleeper{}
+
+ // Associate each waker with the sleeper, then start a goroutine that
+ // asserts them in order.
+ for i := range w {
+ s.AddWaker(&w[i], i)
+ }
+ go func() {
+ n := 0
+ for n < wakeRequests {
+ wk := w[n%len(w)]
+ wk.Assert()
+ n++
+ }
+ }()
+
+ // Wait for all wake up notifications from all wakers.
+ for i := 0; i < wakeRequests; i++ {
+ v, _ := s.Fetch(true)
+ if got, want := v, i%wakers; got != want {
+ t.Fatalf("got %d want %d", got, want)
+ }
+ }
+}
+
// BenchmarkSleeperMultiSelect measures how long it takes to fetch a wake up
// from 4 wakers when at least one is already asserted.
func BenchmarkSleeperMultiSelect(b *testing.B) {
diff --git a/pkg/syncutil/BUILD b/pkg/sync/BUILD
index b06a90bef..e8cd16b8f 100644
--- a/pkg/syncutil/BUILD
+++ b/pkg/sync/BUILD
@@ -29,10 +29,9 @@ go_template(
)
go_library(
- name = "syncutil",
+ name = "sync",
srcs = [
- "downgradable_rwmutex_1_12_unsafe.go",
- "downgradable_rwmutex_1_13_unsafe.go",
+ "aliases.go",
"downgradable_rwmutex_unsafe.go",
"memmove_unsafe.go",
"norace_unsafe.go",
@@ -40,15 +39,15 @@ go_library(
"seqcount.go",
"syncutil.go",
],
- importpath = "gvisor.dev/gvisor/pkg/syncutil",
+ importpath = "gvisor.dev/gvisor/pkg/sync",
)
go_test(
- name = "syncutil_test",
+ name = "sync_test",
size = "small",
srcs = [
"downgradable_rwmutex_test.go",
"seqcount_test.go",
],
- embed = [":syncutil"],
+ embed = [":sync"],
)
diff --git a/pkg/syncutil/LICENSE b/pkg/sync/LICENSE
index 6a66aea5e..6a66aea5e 100644
--- a/pkg/syncutil/LICENSE
+++ b/pkg/sync/LICENSE
diff --git a/pkg/syncutil/README.md b/pkg/sync/README.md
index 2183c4e20..2183c4e20 100644
--- a/pkg/syncutil/README.md
+++ b/pkg/sync/README.md
diff --git a/pkg/sync/aliases.go b/pkg/sync/aliases.go
new file mode 100644
index 000000000..20c7ca041
--- /dev/null
+++ b/pkg/sync/aliases.go
@@ -0,0 +1,37 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+import (
+ "sync"
+)
+
+// Aliases of standard library types.
+type (
+ // Mutex is an alias of sync.Mutex.
+ Mutex = sync.Mutex
+
+ // RWMutex is an alias of sync.RWMutex.
+ RWMutex = sync.RWMutex
+
+ // Cond is an alias of sync.Cond.
+ Cond = sync.Cond
+
+ // Locker is an alias of sync.Locker.
+ Locker = sync.Locker
+
+ // Once is an alias of sync.Once.
+ Once = sync.Once
+
+ // Pool is an alias of sync.Pool.
+ Pool = sync.Pool
+
+ // WaitGroup is an alias of sync.WaitGroup.
+ WaitGroup = sync.WaitGroup
+
+ // Map is an alias of sync.Map.
+ Map = sync.Map
+)
diff --git a/pkg/syncutil/atomicptr_unsafe.go b/pkg/sync/atomicptr_unsafe.go
index 525c4beed..525c4beed 100644
--- a/pkg/syncutil/atomicptr_unsafe.go
+++ b/pkg/sync/atomicptr_unsafe.go
diff --git a/pkg/syncutil/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD
index 63f411a90..418eda29c 100644
--- a/pkg/syncutil/atomicptrtest/BUILD
+++ b/pkg/sync/atomicptrtest/BUILD
@@ -9,7 +9,7 @@ go_template_instance(
out = "atomicptr_int_unsafe.go",
package = "atomicptr",
suffix = "Int",
- template = "//pkg/syncutil:generic_atomicptr",
+ template = "//pkg/sync:generic_atomicptr",
types = {
"Value": "int",
},
@@ -18,7 +18,7 @@ go_template_instance(
go_library(
name = "atomicptr",
srcs = ["atomicptr_int_unsafe.go"],
- importpath = "gvisor.dev/gvisor/pkg/syncutil/atomicptr",
+ importpath = "gvisor.dev/gvisor/pkg/sync/atomicptr",
)
go_test(
diff --git a/pkg/syncutil/atomicptrtest/atomicptr_test.go b/pkg/sync/atomicptrtest/atomicptr_test.go
index 8fdc5112e..8fdc5112e 100644
--- a/pkg/syncutil/atomicptrtest/atomicptr_test.go
+++ b/pkg/sync/atomicptrtest/atomicptr_test.go
diff --git a/pkg/syncutil/downgradable_rwmutex_test.go b/pkg/sync/downgradable_rwmutex_test.go
index ffaf7ecc7..f04496bc5 100644
--- a/pkg/syncutil/downgradable_rwmutex_test.go
+++ b/pkg/sync/downgradable_rwmutex_test.go
@@ -9,7 +9,7 @@
// addition of downgradingWriter and the renaming of num_iterations to
// numIterations to shut up Golint.
-package syncutil
+package sync
import (
"fmt"
diff --git a/pkg/syncutil/downgradable_rwmutex_unsafe.go b/pkg/sync/downgradable_rwmutex_unsafe.go
index 07feca402..9bb55cd3a 100644
--- a/pkg/syncutil/downgradable_rwmutex_unsafe.go
+++ b/pkg/sync/downgradable_rwmutex_unsafe.go
@@ -3,7 +3,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build go1.12
+// +build go1.13
// +build !go1.15
// Check go:linkname function signatures when updating Go version.
@@ -16,7 +16,7 @@
// - RUnlock -> Lock (via writerSem)
// - DowngradeLock -> RLock (via readerSem)
-package syncutil
+package sync
import (
"sync"
@@ -27,6 +27,9 @@ import (
//go:linkname runtimeSemacquire sync.runtime_Semacquire
func runtimeSemacquire(s *uint32)
+//go:linkname runtimeSemrelease sync.runtime_Semrelease
+func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
+
// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock
// method.
type DowngradableRWMutex struct {
diff --git a/pkg/syncutil/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go
index 348675baa..ad4a3a37e 100644
--- a/pkg/syncutil/memmove_unsafe.go
+++ b/pkg/sync/memmove_unsafe.go
@@ -8,7 +8,7 @@
// Check go:linkname function signatures when updating Go version.
-package syncutil
+package sync
import (
"unsafe"
diff --git a/pkg/syncutil/norace_unsafe.go b/pkg/sync/norace_unsafe.go
index 0a0a9deda..006055dd6 100644
--- a/pkg/syncutil/norace_unsafe.go
+++ b/pkg/sync/norace_unsafe.go
@@ -5,7 +5,7 @@
// +build !race
-package syncutil
+package sync
import (
"unsafe"
diff --git a/pkg/syncutil/race_unsafe.go b/pkg/sync/race_unsafe.go
index 206067ec1..31d8fa9a6 100644
--- a/pkg/syncutil/race_unsafe.go
+++ b/pkg/sync/race_unsafe.go
@@ -5,7 +5,7 @@
// +build race
-package syncutil
+package sync
import (
"runtime"
diff --git a/pkg/syncutil/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go
index cb6d2eb22..eda6fb131 100644
--- a/pkg/syncutil/seqatomic_unsafe.go
+++ b/pkg/sync/seqatomic_unsafe.go
@@ -13,7 +13,7 @@ import (
"strings"
"unsafe"
- "gvisor.dev/gvisor/pkg/syncutil"
+ "gvisor.dev/gvisor/pkg/sync"
)
// Value is a required type parameter.
@@ -26,17 +26,17 @@ type Value struct{}
// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race
// with any writer critical sections in sc.
-func SeqAtomicLoad(sc *syncutil.SeqCount, ptr *Value) Value {
+func SeqAtomicLoad(sc *sync.SeqCount, ptr *Value) Value {
// This function doesn't use SeqAtomicTryLoad because doing so is
// measurably, significantly (~20%) slower; Go is awful at inlining.
var val Value
for {
epoch := sc.BeginRead()
- if syncutil.RaceEnabled {
+ if sync.RaceEnabled {
// runtime.RaceDisable() doesn't actually stop the race detector,
// so it can't help us here. Instead, call runtime.memmove
// directly, which is not instrumented by the race detector.
- syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
+ sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
} else {
// This is ~40% faster for short reads than going through memmove.
val = *ptr
@@ -52,10 +52,10 @@ func SeqAtomicLoad(sc *syncutil.SeqCount, ptr *Value) Value {
// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read
// would race with a writer critical section, SeqAtomicTryLoad returns
// (unspecified, false).
-func SeqAtomicTryLoad(sc *syncutil.SeqCount, epoch syncutil.SeqCountEpoch, ptr *Value) (Value, bool) {
+func SeqAtomicTryLoad(sc *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (Value, bool) {
var val Value
- if syncutil.RaceEnabled {
- syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
+ if sync.RaceEnabled {
+ sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val))
} else {
val = *ptr
}
@@ -66,7 +66,7 @@ func init() {
var val Value
typ := reflect.TypeOf(val)
name := typ.Name()
- if ptrs := syncutil.PointersInType(typ, name); len(ptrs) != 0 {
+ if ptrs := sync.PointersInType(typ, name); len(ptrs) != 0 {
panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n")))
}
}
diff --git a/pkg/syncutil/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD
index ba18f3238..eba21518d 100644
--- a/pkg/syncutil/seqatomictest/BUILD
+++ b/pkg/sync/seqatomictest/BUILD
@@ -9,7 +9,7 @@ go_template_instance(
out = "seqatomic_int_unsafe.go",
package = "seqatomic",
suffix = "Int",
- template = "//pkg/syncutil:generic_seqatomic",
+ template = "//pkg/sync:generic_seqatomic",
types = {
"Value": "int",
},
@@ -18,9 +18,9 @@ go_template_instance(
go_library(
name = "seqatomic",
srcs = ["seqatomic_int_unsafe.go"],
- importpath = "gvisor.dev/gvisor/pkg/syncutil/seqatomic",
+ importpath = "gvisor.dev/gvisor/pkg/sync/seqatomic",
deps = [
- "//pkg/syncutil",
+ "//pkg/sync",
],
)
@@ -29,7 +29,5 @@ go_test(
size = "small",
srcs = ["seqatomic_test.go"],
embed = [":seqatomic"],
- deps = [
- "//pkg/syncutil",
- ],
+ deps = ["//pkg/sync"],
)
diff --git a/pkg/syncutil/seqatomictest/seqatomic_test.go b/pkg/sync/seqatomictest/seqatomic_test.go
index b0db44999..2c4568b07 100644
--- a/pkg/syncutil/seqatomictest/seqatomic_test.go
+++ b/pkg/sync/seqatomictest/seqatomic_test.go
@@ -19,11 +19,11 @@ import (
"testing"
"time"
- "gvisor.dev/gvisor/pkg/syncutil"
+ "gvisor.dev/gvisor/pkg/sync"
)
func TestSeqAtomicLoadUncontended(t *testing.T) {
- var seq syncutil.SeqCount
+ var seq sync.SeqCount
const want = 1
data := want
if got := SeqAtomicLoadInt(&seq, &data); got != want {
@@ -32,7 +32,7 @@ func TestSeqAtomicLoadUncontended(t *testing.T) {
}
func TestSeqAtomicLoadAfterWrite(t *testing.T) {
- var seq syncutil.SeqCount
+ var seq sync.SeqCount
var data int
const want = 1
seq.BeginWrite()
@@ -44,7 +44,7 @@ func TestSeqAtomicLoadAfterWrite(t *testing.T) {
}
func TestSeqAtomicLoadDuringWrite(t *testing.T) {
- var seq syncutil.SeqCount
+ var seq sync.SeqCount
var data int
const want = 1
seq.BeginWrite()
@@ -59,7 +59,7 @@ func TestSeqAtomicLoadDuringWrite(t *testing.T) {
}
func TestSeqAtomicTryLoadUncontended(t *testing.T) {
- var seq syncutil.SeqCount
+ var seq sync.SeqCount
const want = 1
data := want
epoch := seq.BeginRead()
@@ -69,7 +69,7 @@ func TestSeqAtomicTryLoadUncontended(t *testing.T) {
}
func TestSeqAtomicTryLoadDuringWrite(t *testing.T) {
- var seq syncutil.SeqCount
+ var seq sync.SeqCount
var data int
epoch := seq.BeginRead()
seq.BeginWrite()
@@ -80,7 +80,7 @@ func TestSeqAtomicTryLoadDuringWrite(t *testing.T) {
}
func TestSeqAtomicTryLoadAfterWrite(t *testing.T) {
- var seq syncutil.SeqCount
+ var seq sync.SeqCount
var data int
epoch := seq.BeginRead()
seq.BeginWrite()
@@ -91,7 +91,7 @@ func TestSeqAtomicTryLoadAfterWrite(t *testing.T) {
}
func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) {
- var seq syncutil.SeqCount
+ var seq sync.SeqCount
const want = 42
data := want
b.RunParallel(func(pb *testing.PB) {
@@ -104,7 +104,7 @@ func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) {
}
func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) {
- var seq syncutil.SeqCount
+ var seq sync.SeqCount
const want = 42
data := want
b.RunParallel(func(pb *testing.PB) {
diff --git a/pkg/syncutil/seqcount.go b/pkg/sync/seqcount.go
index 11d8dbfaa..a1e895352 100644
--- a/pkg/syncutil/seqcount.go
+++ b/pkg/sync/seqcount.go
@@ -3,7 +3,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-package syncutil
+package sync
import (
"fmt"
diff --git a/pkg/syncutil/seqcount_test.go b/pkg/sync/seqcount_test.go
index 14d6aedea..6eb7b4b59 100644
--- a/pkg/syncutil/seqcount_test.go
+++ b/pkg/sync/seqcount_test.go
@@ -3,7 +3,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-package syncutil
+package sync
import (
"reflect"
diff --git a/pkg/syncutil/syncutil.go b/pkg/sync/syncutil.go
index 66e750d06..b16cf5333 100644
--- a/pkg/syncutil/syncutil.go
+++ b/pkg/sync/syncutil.go
@@ -3,5 +3,5 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// Package syncutil provides synchronization primitives.
-package syncutil
+// Package sync provides synchronization primitives.
+package sync
diff --git a/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
deleted file mode 100644
index 7c6336e62..000000000
--- a/pkg/syncutil/downgradable_rwmutex_1_12_unsafe.go
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.12
-// +build !go1.13
-
-// TODO(b/133868570): Delete once Go 1.12 is no longer supported.
-
-package syncutil
-
-import _ "unsafe"
-
-//go:linkname runtimeSemrelease112 sync.runtime_Semrelease
-func runtimeSemrelease112(s *uint32, handoff bool)
-
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int) {
- // 'skipframes' is only available starting from 1.13.
- runtimeSemrelease112(s, handoff)
-}
diff --git a/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go b/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go
deleted file mode 100644
index 3c3673119..000000000
--- a/pkg/syncutil/downgradable_rwmutex_1_13_unsafe.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Copyright 2019 The gVisor Authors.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build go1.13
-// +build !go1.15
-
-// Check go:linkname function signatures when updating Go version.
-
-package syncutil
-
-import _ "unsafe"
-
-//go:linkname runtimeSemrelease sync.runtime_Semrelease
-func runtimeSemrelease(s *uint32, handoff bool, skipframes int)
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index 1987e89cc..2269f6237 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -45,6 +45,7 @@ var (
ELIBBAD = error(syscall.ELIBBAD)
ELOOP = error(syscall.ELOOP)
EMFILE = error(syscall.EMFILE)
+ EMLINK = error(syscall.EMLINK)
EMSGSIZE = error(syscall.EMSGSIZE)
ENAMETOOLONG = error(syscall.ENAMETOOLONG)
ENOATTR = ENODATA
diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
index 65d4d0cd8..ebc8d0209 100644
--- a/pkg/tcpip/BUILD
+++ b/pkg/tcpip/BUILD
@@ -10,10 +10,12 @@ go_library(
"packet_buffer_state.go",
"tcpip.go",
"time_unsafe.go",
+ "timer.go",
],
importpath = "gvisor.dev/gvisor/pkg/tcpip",
visibility = ["//visibility:public"],
deps = [
+ "//pkg/sync",
"//pkg/tcpip/buffer",
"//pkg/tcpip/iptables",
"//pkg/waiter",
@@ -26,3 +28,10 @@ go_test(
srcs = ["tcpip_test.go"],
embed = [":tcpip"],
)
+
+go_test(
+ name = "tcpip_x_test",
+ size = "small",
+ srcs = ["timer_test.go"],
+ deps = [":tcpip"],
+)
diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD
index 78df5a0b1..3df7d18d3 100644
--- a/pkg/tcpip/adapters/gonet/BUILD
+++ b/pkg/tcpip/adapters/gonet/BUILD
@@ -9,6 +9,7 @@ go_library(
importpath = "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet",
visibility = ["//visibility:public"],
deps = [
+ "//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/stack",
diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go
index cd6ce930a..a2f44b496 100644
--- a/pkg/tcpip/adapters/gonet/gonet.go
+++ b/pkg/tcpip/adapters/gonet/gonet.go
@@ -20,9 +20,9 @@ import (
"errors"
"io"
"net"
- "sync"
"time"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/stack"
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
index 2f15bf1f1..885d773b0 100644
--- a/pkg/tcpip/checker/checker.go
+++ b/pkg/tcpip/checker/checker.go
@@ -33,6 +33,9 @@ type NetworkChecker func(*testing.T, []header.Network)
// TransportChecker is a function to check a property of a transport packet.
type TransportChecker func(*testing.T, header.Transport)
+// ControlMessagesChecker is a function to check a property of ancillary data.
+type ControlMessagesChecker func(*testing.T, tcpip.ControlMessages)
+
// IPv4 checks the validity and properties of the given IPv4 packet. It is
// expected to be used in conjunction with other network checkers for specific
// properties. For example, to check the source and destination address, one
@@ -158,6 +161,19 @@ func FragmentFlags(flags uint8) NetworkChecker {
}
}
+// ReceiveTOS creates a checker that checks the TOS field in ControlMessages.
+func ReceiveTOS(want uint8) ControlMessagesChecker {
+ return func(t *testing.T, cm tcpip.ControlMessages) {
+ t.Helper()
+ if !cm.HasTOS {
+ t.Fatalf("got cm.HasTOS = %t, want cm.TOS = %d", cm.HasTOS, want)
+ }
+ if got := cm.TOS; got != want {
+ t.Fatalf("got cm.TOS = %d, want %d", got, want)
+ }
+ }
+}
+
// TOS creates a checker that checks the TOS field.
func TOS(tos uint8, label uint32) NetworkChecker {
return func(t *testing.T, h []header.Network) {
@@ -754,3 +770,9 @@ func NDPNSTargetAddress(want tcpip.Address) TransportChecker {
}
}
}
+
+// NDPRS creates a checker that checks that the packet contains a valid NDP
+// Router Solicitation message (as per the raw wire format).
+func NDPRS() NetworkChecker {
+ return NDP(header.ICMPv6RouterSolicit, header.NDPRSMinimumSize)
+}
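
A minimal usage sketch for the new ControlMessagesChecker (illustrative only, not part of the change): the cm value is assumed to come from an endpoint read that populated ancillary data, and the package name and wantTOS value are placeholders.

package example_test // hypothetical test package

import (
	"testing"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/checker"
)

// checkReceivedTOS runs the new ReceiveTOS checker against ancillary data
// returned by a read.
func checkReceivedTOS(t *testing.T, cm tcpip.ControlMessages) {
	const wantTOS = 0xC0 // placeholder TOS value
	// ReceiveTOS returns a ControlMessagesChecker; invoking it asserts
	// that cm.HasTOS is set and cm.TOS matches wantTOS.
	checker.ReceiveTOS(wantTOS)(t, cm)
}
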
diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
index a3485b35c..cd747d100 100644
--- a/pkg/tcpip/header/BUILD
+++ b/pkg/tcpip/header/BUILD
@@ -20,6 +20,7 @@ go_library(
"ndp_neighbor_solicit.go",
"ndp_options.go",
"ndp_router_advert.go",
+ "ndp_router_solicit.go",
"tcp.go",
"udp.go",
],
@@ -38,12 +39,16 @@ go_test(
size = "small",
srcs = [
"checksum_test.go",
+ "ipv6_test.go",
"ipversion_test.go",
"tcp_test.go",
],
deps = [
":header",
+ "//pkg/rand",
+ "//pkg/tcpip",
"//pkg/tcpip/buffer",
+ "@com_github_google_go-cmp//cmp:go_default_library",
],
)
@@ -55,5 +60,8 @@ go_test(
"ndp_test.go",
],
embed = [":header"],
- deps = ["//pkg/tcpip"],
+ deps = [
+ "//pkg/tcpip",
+ "@com_github_google_go-cmp//cmp:go_default_library",
+ ],
)
diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
index 0caa51c1e..70e6ce095 100644
--- a/pkg/tcpip/header/ipv6.go
+++ b/pkg/tcpip/header/ipv6.go
@@ -15,6 +15,7 @@
package header
import (
+ "crypto/sha256"
"encoding/binary"
"strings"
@@ -83,6 +84,13 @@ const (
// The address is ff02::1.
IPv6AllNodesMulticastAddress tcpip.Address = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+ // IPv6AllRoutersMulticastAddress is a link-local multicast group that
+ // all IPv6 routers MUST join, as per RFC 4291, section 2.8. Packets
+ // destined to this address will reach all routers on a link.
+ //
+ // The address is ff02::2.
+ IPv6AllRoutersMulticastAddress tcpip.Address = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+
// IPv6MinimumMTU is the minimum MTU required by IPv6, per RFC 2460,
// section 5.
IPv6MinimumMTU = 1280
@@ -90,6 +98,23 @@ const (
// IPv6Any is the non-routable IPv6 "any" meta address. It is also
// known as the unspecified address.
IPv6Any tcpip.Address = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+
+ // IIDSize is the size of an interface identifier (IID), in bytes, as
+ // defined by RFC 4291 section 2.5.1.
+ IIDSize = 8
+
+ // IIDOffsetInIPv6Address is the offset, in bytes, from the start
+ // of an IPv6 address to the beginning of the interface identifier
+ // (IID) for auto-generated addresses. That is, all bytes before
+ // the IIDOffsetInIPv6Address-th byte are the prefix bytes, and all
+ // bytes including and after the IIDOffsetInIPv6Address-th byte are
+ // for the IID.
+ IIDOffsetInIPv6Address = 8
+
+ // OpaqueIIDSecretKeyMinBytes is the recommended minimum number of bytes
+ // for the secret key used to generate an opaque interface identifier as
+ // outlined by RFC 7217.
+ OpaqueIIDSecretKeyMinBytes = 16
)
// IPv6EmptySubnet is the empty IPv6 subnet. It may also be known as the
@@ -266,27 +291,43 @@ func SolicitedNodeAddr(addr tcpip.Address) tcpip.Address {
return solicitedNodeMulticastPrefix + addr[len(addr)-3:]
}
+// EthernetAdddressToModifiedEUI64IntoBuf populates buf with a modified EUI-64
+// from a 48-bit Ethernet/MAC address, as per RFC 4291 section 2.5.1.
+//
+// buf MUST be at least 8 bytes.
+func EthernetAdddressToModifiedEUI64IntoBuf(linkAddr tcpip.LinkAddress, buf []byte) {
+ buf[0] = linkAddr[0] ^ 2
+ buf[1] = linkAddr[1]
+ buf[2] = linkAddr[2]
+ buf[3] = 0xFF
+ buf[4] = 0xFE
+ buf[5] = linkAddr[3]
+ buf[6] = linkAddr[4]
+ buf[7] = linkAddr[5]
+}
+
+// EthernetAddressToModifiedEUI64 computes a modified EUI-64 from a 48-bit
+// Ethernet/MAC address, as per RFC 4291 section 2.5.1.
+func EthernetAddressToModifiedEUI64(linkAddr tcpip.LinkAddress) [IIDSize]byte {
+ var buf [IIDSize]byte
+ EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, buf[:])
+ return buf
+}
+
// LinkLocalAddr computes the default IPv6 link-local address from a link-layer
// (MAC) address.
func LinkLocalAddr(linkAddr tcpip.LinkAddress) tcpip.Address {
- // Convert a 48-bit MAC to an EUI-64 and then prepend the link-local
- // header, FE80::.
+ // Convert a 48-bit MAC to a modified EUI-64 and then prepend the
+ // link-local header, FE80::.
//
// The conversion is very nearly:
// aa:bb:cc:dd:ee:ff => FE80::Aabb:ccFF:FEdd:eeff
// Note the capital A. The conversion aa->Aa involves a bit flip.
- lladdrb := [16]byte{
- 0: 0xFE,
- 1: 0x80,
- 8: linkAddr[0] ^ 2,
- 9: linkAddr[1],
- 10: linkAddr[2],
- 11: 0xFF,
- 12: 0xFE,
- 13: linkAddr[3],
- 14: linkAddr[4],
- 15: linkAddr[5],
+ lladdrb := [IPv6AddressSize]byte{
+ 0: 0xFE,
+ 1: 0x80,
}
+ EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, lladdrb[IIDOffsetInIPv6Address:])
return tcpip.Address(lladdrb[:])
}
@@ -298,3 +339,85 @@ func IsV6LinkLocalAddress(addr tcpip.Address) bool {
}
return addr[0] == 0xfe && (addr[1]&0xc0) == 0x80
}
+
+// IsV6UniqueLocalAddress determines if the provided address is an IPv6
+// unique-local address (within the prefix FC00::/7).
+func IsV6UniqueLocalAddress(addr tcpip.Address) bool {
+ if len(addr) != IPv6AddressSize {
+ return false
+ }
+ // According to RFC 4193 section 3.1, a unique local address has the prefix
+ // FC00::/7.
+ return (addr[0] & 0xfe) == 0xfc
+}
+
+// AppendOpaqueInterfaceIdentifier appends a 64 bit opaque interface identifier
+// (IID) to buf as outlined by RFC 7217 and returns the extended buffer.
+//
+// The opaque IID is generated from the cryptographic hash of the concatenation
+// of the prefix, NIC's name, DAD counter (DAD retry counter) and the secret
+// key. The secret key SHOULD be at least OpaqueIIDSecretKeyMinBytes bytes and
+// MUST be a pseudo-random number. See RFC 4086 for randomness
+// requirements for security.
+//
+// If buf has enough capacity for the IID (IIDSize bytes), a new underlying
+// array for the buffer will not be allocated.
+func AppendOpaqueInterfaceIdentifier(buf []byte, prefix tcpip.Subnet, nicName string, dadCounter uint8, secretKey []byte) []byte {
+ // As per RFC 7217 section 5, the opaque identifier can be generated as a
+ // cryptographic hash of the concatenation of each of the function parameters.
+ // Note, we omit the optional Network_ID field.
+ h := sha256.New()
+ // h.Write never returns an error.
+ h.Write([]byte(prefix.ID()[:IIDOffsetInIPv6Address]))
+ h.Write([]byte(nicName))
+ h.Write([]byte{dadCounter})
+ h.Write(secretKey)
+
+ var sumBuf [sha256.Size]byte
+ sum := h.Sum(sumBuf[:0])
+
+ return append(buf, sum[:IIDSize]...)
+}
+
+// LinkLocalAddrWithOpaqueIID computes the default IPv6 link-local address with
+// an opaque IID.
+func LinkLocalAddrWithOpaqueIID(nicName string, dadCounter uint8, secretKey []byte) tcpip.Address {
+ lladdrb := [IPv6AddressSize]byte{
+ 0: 0xFE,
+ 1: 0x80,
+ }
+
+ return tcpip.Address(AppendOpaqueInterfaceIdentifier(lladdrb[:IIDOffsetInIPv6Address], IPv6LinkLocalPrefix.Subnet(), nicName, dadCounter, secretKey))
+}
+
+// IPv6AddressScope is the scope of an IPv6 address.
+type IPv6AddressScope int
+
+const (
+ // LinkLocalScope indicates a link-local address.
+ LinkLocalScope IPv6AddressScope = iota
+
+ // UniqueLocalScope indicates a unique-local address.
+ UniqueLocalScope
+
+ // GlobalScope indicates a global address.
+ GlobalScope
+)
+
+// ScopeForIPv6Address returns the scope for an IPv6 address.
+func ScopeForIPv6Address(addr tcpip.Address) (IPv6AddressScope, *tcpip.Error) {
+ if len(addr) != IPv6AddressSize {
+ return GlobalScope, tcpip.ErrBadAddress
+ }
+
+ switch {
+ case IsV6LinkLocalAddress(addr):
+ return LinkLocalScope, nil
+
+ case IsV6UniqueLocalAddress(addr):
+ return UniqueLocalScope, nil
+
+ default:
+ return GlobalScope, nil
+ }
+}
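
A rough sketch of how the opaque-IID helpers and the new scope classifier added above might be used together (illustrative; the NIC name, DAD counter and key handling are placeholder choices):

package example // hypothetical package

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/rand"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

func opaqueLinkLocalExample() {
	// RFC 7217 recommends a secret key of at least
	// OpaqueIIDSecretKeyMinBytes of pseudo-random data.
	secretKey := make([]byte, header.OpaqueIIDSecretKeyMinBytes)
	if _, err := rand.Read(secretKey); err != nil {
		panic(err)
	}

	// Stable link-local address for NIC "eth0" on its first DAD attempt.
	addr := header.LinkLocalAddrWithOpaqueIID("eth0", 0, secretKey)

	// The generated address is link-local by construction.
	scope, err := header.ScopeForIPv6Address(addr)
	fmt.Println(scope == header.LinkLocalScope, err)
}
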
diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go
new file mode 100644
index 000000000..29f54bc57
--- /dev/null
+++ b/pkg/tcpip/header/ipv6_test.go
@@ -0,0 +1,302 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header_test
+
+import (
+ "bytes"
+ "crypto/sha256"
+ "testing"
+
+ "github.com/google/go-cmp/cmp"
+ "gvisor.dev/gvisor/pkg/rand"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+const (
+ linkAddr = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+ linkLocalAddr = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+ uniqueLocalAddr1 = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+ uniqueLocalAddr2 = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+ globalAddr = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+)
+
+func TestEthernetAdddressToModifiedEUI64(t *testing.T) {
+ expectedIID := [header.IIDSize]byte{0, 2, 3, 255, 254, 4, 5, 6}
+
+ if diff := cmp.Diff(expectedIID, header.EthernetAddressToModifiedEUI64(linkAddr)); diff != "" {
+ t.Errorf("EthernetAddressToModifiedEUI64(%s) mismatch (-want +got):\n%s", linkAddr, diff)
+ }
+
+ var buf [header.IIDSize]byte
+ header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, buf[:])
+ if diff := cmp.Diff(expectedIID, buf); diff != "" {
+ t.Errorf("EthernetAddressToModifiedEUI64IntoBuf(%s, _) mismatch (-want +got):\n%s", linkAddr, diff)
+ }
+}
+
+func TestLinkLocalAddr(t *testing.T) {
+ if got, want := header.LinkLocalAddr(linkAddr), tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x02\x03\xff\xfe\x04\x05\x06"); got != want {
+ t.Errorf("got LinkLocalAddr(%s) = %s, want = %s", linkAddr, got, want)
+ }
+}
+
+func TestAppendOpaqueInterfaceIdentifier(t *testing.T) {
+ var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes * 2]byte
+ if n, err := rand.Read(secretKeyBuf[:]); err != nil {
+ t.Fatalf("rand.Read(_): %s", err)
+ } else if want := header.OpaqueIIDSecretKeyMinBytes * 2; n != want {
+ t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", want, n)
+ }
+
+ tests := []struct {
+ name string
+ prefix tcpip.Subnet
+ nicName string
+ dadCounter uint8
+ secretKey []byte
+ }{
+ {
+ name: "SecretKey of minimum size",
+ prefix: header.IPv6LinkLocalPrefix.Subnet(),
+ nicName: "eth0",
+ dadCounter: 0,
+ secretKey: secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes],
+ },
+ {
+ name: "SecretKey of less than minimum size",
+ prefix: func() tcpip.Subnet {
+ addrWithPrefix := tcpip.AddressWithPrefix{
+ Address: "\x01\x02\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ PrefixLen: header.IIDOffsetInIPv6Address * 8,
+ }
+ return addrWithPrefix.Subnet()
+ }(),
+ nicName: "eth10",
+ dadCounter: 1,
+ secretKey: secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes/2],
+ },
+ {
+ name: "SecretKey of more than minimum size",
+ prefix: func() tcpip.Subnet {
+ addrWithPrefix := tcpip.AddressWithPrefix{
+ Address: "\x01\x02\x03\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ PrefixLen: header.IIDOffsetInIPv6Address * 8,
+ }
+ return addrWithPrefix.Subnet()
+ }(),
+ nicName: "eth11",
+ dadCounter: 2,
+ secretKey: secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes*2],
+ },
+ {
+ name: "Nil SecretKey and empty nicName",
+ prefix: func() tcpip.Subnet {
+ addrWithPrefix := tcpip.AddressWithPrefix{
+ Address: "\x01\x02\x03\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ PrefixLen: header.IIDOffsetInIPv6Address * 8,
+ }
+ return addrWithPrefix.Subnet()
+ }(),
+ nicName: "",
+ dadCounter: 3,
+ secretKey: nil,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ h := sha256.New()
+ h.Write([]byte(test.prefix.ID()[:header.IIDOffsetInIPv6Address]))
+ h.Write([]byte(test.nicName))
+ h.Write([]byte{test.dadCounter})
+ if k := test.secretKey; k != nil {
+ h.Write(k)
+ }
+ var hashSum [sha256.Size]byte
+ h.Sum(hashSum[:0])
+ want := hashSum[:header.IIDSize]
+
+ // Passing a nil buffer should result in a new buffer returned with the
+ // IID.
+ if got := header.AppendOpaqueInterfaceIdentifier(nil, test.prefix, test.nicName, test.dadCounter, test.secretKey); !bytes.Equal(got, want) {
+ t.Errorf("got AppendOpaqueInterfaceIdentifier(nil, %s, %s, %d, %x) = %x, want = %x", test.prefix, test.nicName, test.dadCounter, test.secretKey, got, want)
+ }
+
+ // Passing a buffer with sufficient capacity for the IID should populate
+ // the buffer provided.
+ var iidBuf [header.IIDSize]byte
+ if got := header.AppendOpaqueInterfaceIdentifier(iidBuf[:0], test.prefix, test.nicName, test.dadCounter, test.secretKey); !bytes.Equal(got, want) {
+ t.Errorf("got AppendOpaqueInterfaceIdentifier(iidBuf[:0], %s, %s, %d, %x) = %x, want = %x", test.prefix, test.nicName, test.dadCounter, test.secretKey, got, want)
+ }
+ if got := iidBuf[:]; !bytes.Equal(got, want) {
+ t.Errorf("got iidBuf = %x, want = %x", got, want)
+ }
+ })
+ }
+}
+
+func TestLinkLocalAddrWithOpaqueIID(t *testing.T) {
+ var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes * 2]byte
+ if n, err := rand.Read(secretKeyBuf[:]); err != nil {
+ t.Fatalf("rand.Read(_): %s", err)
+ } else if want := header.OpaqueIIDSecretKeyMinBytes * 2; n != want {
+ t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", want, n)
+ }
+
+ prefix := header.IPv6LinkLocalPrefix.Subnet()
+
+ tests := []struct {
+ name string
+ prefix tcpip.Subnet
+ nicName string
+ dadCounter uint8
+ secretKey []byte
+ }{
+ {
+ name: "SecretKey of minimum size",
+ nicName: "eth0",
+ dadCounter: 0,
+ secretKey: secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes],
+ },
+ {
+ name: "SecretKey of less than minimum size",
+ nicName: "eth10",
+ dadCounter: 1,
+ secretKey: secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes/2],
+ },
+ {
+ name: "SecretKey of more than minimum size",
+ nicName: "eth11",
+ dadCounter: 2,
+ secretKey: secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes*2],
+ },
+ {
+ name: "Nil SecretKey and empty nicName",
+ nicName: "",
+ dadCounter: 3,
+ secretKey: nil,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ addrBytes := [header.IPv6AddressSize]byte{
+ 0: 0xFE,
+ 1: 0x80,
+ }
+
+ want := tcpip.Address(header.AppendOpaqueInterfaceIdentifier(
+ addrBytes[:header.IIDOffsetInIPv6Address],
+ prefix,
+ test.nicName,
+ test.dadCounter,
+ test.secretKey,
+ ))
+
+ if got := header.LinkLocalAddrWithOpaqueIID(test.nicName, test.dadCounter, test.secretKey); got != want {
+ t.Errorf("got LinkLocalAddrWithOpaqueIID(%s, %d, %x) = %s, want = %s", test.nicName, test.dadCounter, test.secretKey, got, want)
+ }
+ })
+ }
+}
+
+func TestIsV6UniqueLocalAddress(t *testing.T) {
+ tests := []struct {
+ name string
+ addr tcpip.Address
+ expected bool
+ }{
+ {
+ name: "Valid Unique 1",
+ addr: uniqueLocalAddr1,
+ expected: true,
+ },
+ {
+ name: "Valid Unique 2",
+ addr: uniqueLocalAddr2,
+ expected: true,
+ },
+ {
+ name: "Link Local",
+ addr: linkLocalAddr,
+ expected: false,
+ },
+ {
+ name: "Global",
+ addr: globalAddr,
+ expected: false,
+ },
+ {
+ name: "IPv4",
+ addr: "\x01\x02\x03\x04",
+ expected: false,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ if got := header.IsV6UniqueLocalAddress(test.addr); got != test.expected {
+ t.Errorf("got header.IsV6UniqueLocalAddress(%s) = %t, want = %t", test.addr, got, test.expected)
+ }
+ })
+ }
+}
+
+func TestScopeForIPv6Address(t *testing.T) {
+ tests := []struct {
+ name string
+ addr tcpip.Address
+ scope header.IPv6AddressScope
+ err *tcpip.Error
+ }{
+ {
+ name: "Unique Local",
+ addr: uniqueLocalAddr1,
+ scope: header.UniqueLocalScope,
+ err: nil,
+ },
+ {
+ name: "Link Local",
+ addr: linkLocalAddr,
+ scope: header.LinkLocalScope,
+ err: nil,
+ },
+ {
+ name: "Global",
+ addr: globalAddr,
+ scope: header.GlobalScope,
+ err: nil,
+ },
+ {
+ name: "IPv4",
+ addr: "\x01\x02\x03\x04",
+ scope: header.GlobalScope,
+ err: tcpip.ErrBadAddress,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ got, err := header.ScopeForIPv6Address(test.addr)
+ if err != test.err {
+ t.Errorf("got header.IsV6UniqueLocalAddress(%s) = (_, %v), want = (_, %v)", test.addr, err, test.err)
+ }
+ if got != test.scope {
+ t.Errorf("got header.IsV6UniqueLocalAddress(%s) = (%d, _), want = (%d, _)", test.addr, got, test.scope)
+ }
+ })
+ }
+}
diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
index 1ca6199ef..06e0bace2 100644
--- a/pkg/tcpip/header/ndp_options.go
+++ b/pkg/tcpip/header/ndp_options.go
@@ -17,6 +17,7 @@ package header
import (
"encoding/binary"
"errors"
+ "math"
"time"
"gvisor.dev/gvisor/pkg/tcpip"
@@ -85,6 +86,23 @@ const (
// within an NDPPrefixInformation.
ndpPrefixInformationPrefixOffset = 14
+ // NDPRecursiveDNSServerOptionType is the type of the Recursive DNS
+ // Server option, as per RFC 8106 section 5.1.
+ NDPRecursiveDNSServerOptionType = 25
+
+ // ndpRecursiveDNSServerLifetimeOffset is the start of the 4-byte
+ // Lifetime field within an NDPRecursiveDNSServer.
+ ndpRecursiveDNSServerLifetimeOffset = 2
+
+ // ndpRecursiveDNSServerAddressesOffset is the start of the addresses
+ // for IPv6 Recursive DNS Servers within an NDPRecursiveDNSServer.
+ ndpRecursiveDNSServerAddressesOffset = 6
+
+ // minNDPRecursiveDNSServerLength is the minimum NDP Recursive DNS
+ // Server option's length field value when it contains at least one
+ // IPv6 address.
+ minNDPRecursiveDNSServerLength = 3
+
// lengthByteUnits is the multiplier factor for the Length field of an
// NDP option. That is, the length field for NDP options is in units of
// 8 octets, as per RFC 4861 section 4.6.
@@ -92,13 +110,13 @@ const (
)
var (
- // NDPPrefixInformationInfiniteLifetime is a value that represents
- // infinity for the Valid and Preferred Lifetime fields in a NDP Prefix
- // Information option. Its value is (2^32 - 1)s = 4294967295s
+ // NDPInfiniteLifetime is a value that represents infinity for the
+ // 4-byte lifetime fields found in various NDP options. Its value is
+ // (2^32 - 1)s = 4294967295s.
//
// This is a variable instead of a constant so that tests can change
// this value to a smaller value. It should only be modified by tests.
- NDPPrefixInformationInfiniteLifetime = time.Second * 4294967295
+ NDPInfiniteLifetime = time.Second * math.MaxUint32
)
// NDPOptionIterator is an iterator of NDPOption.
@@ -118,6 +136,7 @@ var (
ErrNDPOptBufExhausted = errors.New("Buffer unexpectedly exhausted")
ErrNDPOptZeroLength = errors.New("NDP option has zero-valued Length field")
ErrNDPOptMalformedBody = errors.New("NDP option has a malformed body")
+ ErrNDPInvalidLength = errors.New("NDP option's Length value is invalid as per relevant RFC")
)
// Next returns the next element in the backing NDPOptions, or true if we are
@@ -182,6 +201,22 @@ func (i *NDPOptionIterator) Next() (NDPOption, bool, error) {
}
return NDPPrefixInformation(body), false, nil
+
+ case NDPRecursiveDNSServerOptionType:
+ // RFC 8106 section 5.3.1 outlines that the RDNSS option
+ // must have a minimum length of 3 so it contains at
+ // least one IPv6 address.
+ if l < minNDPRecursiveDNSServerLength {
+ return nil, true, ErrNDPInvalidLength
+ }
+
+ opt := NDPRecursiveDNSServer(body)
+ if len(opt.Addresses()) == 0 {
+ return nil, true, ErrNDPOptMalformedBody
+ }
+
+ return opt, false, nil
+
default:
// We do not yet recognize the option, just skip for
// now. This is okay because RFC 4861 allows us to
@@ -434,7 +469,7 @@ func (o NDPPrefixInformation) AutonomousAddressConfigurationFlag() bool {
//
// Note, a value of 0 implies the prefix should not be considered as on-link,
// and a value of infinity/forever is represented by
-// NDPPrefixInformationInfiniteLifetime.
+// NDPInfiniteLifetime.
func (o NDPPrefixInformation) ValidLifetime() time.Duration {
// The field is the time in seconds, as per RFC 4861 section 4.6.2.
return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpPrefixInformationValidLifetimeOffset:]))
@@ -447,7 +482,7 @@ func (o NDPPrefixInformation) ValidLifetime() time.Duration {
//
// Note, a value of 0 implies that addresses generated from the prefix should
// no longer remain preferred, and a value of infinity is represented by
-// NDPPrefixInformationInfiniteLifetime.
+// NDPInfiniteLifetime.
//
// Also note that the value of this field MUST NOT exceed the Valid Lifetime
// field to avoid preferring addresses that are no longer valid, for the
@@ -476,3 +511,79 @@ func (o NDPPrefixInformation) Subnet() tcpip.Subnet {
}
return addrWithPrefix.Subnet()
}
+
+// NDPRecursiveDNSServer is the NDP Recursive DNS Server option, as defined by
+// RFC 8106 section 5.1.
+//
+// To make sure that the option meets its minimum length and does not end in the
+// middle of a DNS server's IPv6 address, the length of a valid
+// NDPRecursiveDNSServer must meet the following constraint:
+// (Length - ndpRecursiveDNSServerAddressesOffset) % IPv6AddressSize == 0
+type NDPRecursiveDNSServer []byte
+
+// Type returns the type of an NDP Recursive DNS Server option.
+//
+// Type implements NDPOption.Type.
+func (NDPRecursiveDNSServer) Type() uint8 {
+ return NDPRecursiveDNSServerOptionType
+}
+
+// Length implements NDPOption.Length.
+func (o NDPRecursiveDNSServer) Length() int {
+ return len(o)
+}
+
+// serializeInto implements NDPOption.serializeInto.
+func (o NDPRecursiveDNSServer) serializeInto(b []byte) int {
+ used := copy(b, o)
+
+ // Zero out the reserved bytes that are before the Lifetime field.
+ for i := 0; i < ndpRecursiveDNSServerLifetimeOffset; i++ {
+ b[i] = 0
+ }
+
+ return used
+}
+
+// Lifetime returns the length of time that the DNS server addresses
+// in this option may be used for name resolution.
+//
+// Note, a value of 0 implies the addresses should no longer be used,
+// and a value of infinity/forever is represented by NDPInfiniteLifetime.
+//
+// Lifetime may panic if o does not have enough bytes to hold the Lifetime
+// field.
+func (o NDPRecursiveDNSServer) Lifetime() time.Duration {
+ // The field is the time in seconds, as per RFC 8106 section 5.1.
+ return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpRecursiveDNSServerLifetimeOffset:]))
+}
+
+// Addresses returns the recursive DNS server IPv6 addresses that may be
+// used for name resolution.
+//
+// Note, some of the addresses returned MAY be link-local addresses.
+//
+// Addresses may panic if o does not hold valid IPv6 addresses.
+func (o NDPRecursiveDNSServer) Addresses() []tcpip.Address {
+ l := len(o)
+ if l < ndpRecursiveDNSServerAddressesOffset {
+ return nil
+ }
+
+ l -= ndpRecursiveDNSServerAddressesOffset
+ if l%IPv6AddressSize != 0 {
+ return nil
+ }
+
+ buf := o[ndpRecursiveDNSServerAddressesOffset:]
+ var addrs []tcpip.Address
+ for len(buf) > 0 {
+ addr := tcpip.Address(buf[:IPv6AddressSize])
+ if !IsV6UnicastAddress(addr) {
+ return nil
+ }
+ addrs = append(addrs, addr)
+ buf = buf[IPv6AddressSize:]
+ }
+ return addrs
+}
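
For orientation, a sketch of how a consumer might walk a serialized NDP options buffer and read out RDNSS data (illustrative; optsBuf is assumed to hold valid wire-format options, e.g. the options body of a Router Advertisement):

package example // hypothetical package

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip/header"
)

// printRecursiveDNSServers iterates optsBuf and reports any RDNSS options.
func printRecursiveDNSServers(optsBuf []byte) error {
	it, err := header.NDPOptions(optsBuf).Iter(true /* check */)
	if err != nil {
		return err
	}
	for {
		opt, done, err := it.Next()
		if err != nil {
			return err
		}
		if done {
			return nil
		}
		if rdnss, ok := opt.(header.NDPRecursiveDNSServer); ok {
			fmt.Println("lifetime:", rdnss.Lifetime(), "servers:", rdnss.Addresses())
		}
	}
}
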
diff --git a/pkg/tcpip/header/ndp_router_solicit.go b/pkg/tcpip/header/ndp_router_solicit.go
new file mode 100644
index 000000000..9e67ba95d
--- /dev/null
+++ b/pkg/tcpip/header/ndp_router_solicit.go
@@ -0,0 +1,36 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+// NDPRouterSolicit is an NDP Router Solicitation message. It will only contain
+// the body of an ICMPv6 packet.
+//
+// See RFC 4861 section 4.1 for more details.
+type NDPRouterSolicit []byte
+
+const (
+ // NDPRSMinimumSize is the minimum size of a valid NDP Router
+ // Solicitation message (body of an ICMPv6 packet).
+ NDPRSMinimumSize = 4
+
+ // ndpRSOptionsOffset is the start of the NDP options in an
+ // NDPRouterSolicit.
+ ndpRSOptionsOffset = 4
+)
+
+// Options returns an NDPOptions of the options body.
+func (b NDPRouterSolicit) Options() NDPOptions {
+ return NDPOptions(b[ndpRSOptionsOffset:])
+}
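
A small sketch of consuming the new type (illustrative; icmpBody is assumed to be the body of an ICMPv6 Router Solicitation at least NDPRSMinimumSize bytes long):

package example // hypothetical package

import "gvisor.dev/gvisor/pkg/tcpip/header"

// routerSolicitOptions extracts the NDP options carried by a Router
// Solicitation body.
func routerSolicitOptions(icmpBody []byte) header.NDPOptions {
	rs := header.NDPRouterSolicit(icmpBody)
	return rs.Options()
}
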
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
index ad6daafcd..2c439d70c 100644
--- a/pkg/tcpip/header/ndp_test.go
+++ b/pkg/tcpip/header/ndp_test.go
@@ -19,6 +19,7 @@ import (
"testing"
"time"
+ "github.com/google/go-cmp/cmp"
"gvisor.dev/gvisor/pkg/tcpip"
)
@@ -369,6 +370,175 @@ func TestNDPPrefixInformationOption(t *testing.T) {
}
}
+func TestNDPRecursiveDNSServerOptionSerialize(t *testing.T) {
+ b := []byte{
+ 9, 8,
+ 1, 2, 4, 8,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ }
+ targetBuf := []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
+ expected := []byte{
+ 25, 3, 0, 0,
+ 1, 2, 4, 8,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ }
+ opts := NDPOptions(targetBuf)
+ serializer := NDPOptionsSerializer{
+ NDPRecursiveDNSServer(b),
+ }
+ if got, want := opts.Serialize(serializer), len(expected); got != want {
+ t.Errorf("got Serialize = %d, want = %d", got, want)
+ }
+ if !bytes.Equal(targetBuf, expected) {
+ t.Fatalf("got targetBuf = %x, want = %x", targetBuf, expected)
+ }
+
+ it, err := opts.Iter(true)
+ if err != nil {
+ t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+ }
+
+ next, done, err := it.Next()
+ if err != nil {
+ t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+ }
+ if done {
+ t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+ }
+ if got := next.Type(); got != NDPRecursiveDNSServerOptionType {
+ t.Errorf("got Type = %d, want = %d", got, NDPRecursiveDNSServerOptionType)
+ }
+
+ opt, ok := next.(NDPRecursiveDNSServer)
+ if !ok {
+ t.Fatalf("next (type = %T) cannot be casted to an NDPRecursiveDNSServer", next)
+ }
+ if got := opt.Type(); got != 25 {
+ t.Errorf("got Type = %d, want = 31", got)
+ }
+ if got := opt.Length(); got != 22 {
+ t.Errorf("got Length = %d, want = 22", got)
+ }
+ if got, want := opt.Lifetime(), 16909320*time.Second; got != want {
+ t.Errorf("got Lifetime = %s, want = %s", got, want)
+ }
+ want := []tcpip.Address{
+ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+ }
+ if got := opt.Addresses(); !cmp.Equal(got, want) {
+ t.Errorf("got Addresses = %v, want = %v", got, want)
+ }
+
+ // Iterator should not return anything else.
+ next, done, err = it.Next()
+ if err != nil {
+ t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+ }
+ if !done {
+ t.Error("got Next = (_, false, _), want = (_, true, _)")
+ }
+ if next != nil {
+ t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+ }
+}
+
+func TestNDPRecursiveDNSServerOption(t *testing.T) {
+ tests := []struct {
+ name string
+ buf []byte
+ lifetime time.Duration
+ addrs []tcpip.Address
+ }{
+ {
+ "Valid1Addr",
+ []byte{
+ 25, 3, 0, 0,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ },
+ 0,
+ []tcpip.Address{
+ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+ },
+ },
+ {
+ "Valid2Addr",
+ []byte{
+ 25, 5, 0, 0,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16,
+ },
+ 0,
+ []tcpip.Address{
+ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+ "\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x10",
+ },
+ },
+ {
+ "Valid3Addr",
+ []byte{
+ 25, 7, 0, 0,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16,
+ 17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17,
+ },
+ 0,
+ []tcpip.Address{
+ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+ "\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x10",
+ "\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x11",
+ },
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ opts := NDPOptions(test.buf)
+ it, err := opts.Iter(true)
+ if err != nil {
+ t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+ }
+
+ // Iterator should get our option.
+ next, done, err := it.Next()
+ if err != nil {
+ t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+ }
+ if done {
+ t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+ }
+ if got := next.Type(); got != NDPRecursiveDNSServerOptionType {
+ t.Fatalf("got Type %= %d, want = %d", got, NDPRecursiveDNSServerOptionType)
+ }
+
+ opt, ok := next.(NDPRecursiveDNSServer)
+ if !ok {
+ t.Fatalf("next (type = %T) cannot be casted to an NDPRecursiveDNSServer", next)
+ }
+ if got := opt.Lifetime(); got != test.lifetime {
+ t.Errorf("got Lifetime = %d, want = %d", got, test.lifetime)
+ }
+ if got := opt.Addresses(); !cmp.Equal(got, test.addrs) {
+ t.Errorf("got Addresses = %v, want = %v", got, test.addrs)
+ }
+
+ // Iterator should not return anything else.
+ next, done, err = it.Next()
+ if err != nil {
+ t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+ }
+ if !done {
+ t.Error("got Next = (_, false, _), want = (_, true, _)")
+ }
+ if next != nil {
+ t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+ }
+ })
+ }
+}
+
// TestNDPOptionsIterCheck tests that Iter will return false if the NDPOptions
// the iterator was returned for is malformed.
func TestNDPOptionsIterCheck(t *testing.T) {
@@ -473,6 +643,51 @@ func TestNDPOptionsIterCheck(t *testing.T) {
},
nil,
},
+ {
+ "InvalidRecursiveDNSServerCutsOffAddress",
+ []byte{
+ 25, 4, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ },
+ ErrNDPOptMalformedBody,
+ },
+ {
+ "InvalidRecursiveDNSServerInvalidLengthField",
+ []byte{
+ 25, 2, 0, 0,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8,
+ },
+ ErrNDPInvalidLength,
+ },
+ {
+ "RecursiveDNSServerTooSmall",
+ []byte{
+ 25, 1, 0, 0,
+ 0, 0, 0,
+ },
+ ErrNDPOptBufExhausted,
+ },
+ {
+ "RecursiveDNSServerMulticast",
+ []byte{
+ 25, 3, 0, 0,
+ 0, 0, 0, 0,
+ 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ },
+ ErrNDPOptMalformedBody,
+ },
+ {
+ "RecursiveDNSServerUnspecified",
+ []byte{
+ 25, 3, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ ErrNDPOptMalformedBody,
+ },
}
for _, test := range tests {
diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD
index cc5f531e2..64769c333 100644
--- a/pkg/tcpip/iptables/BUILD
+++ b/pkg/tcpip/iptables/BUILD
@@ -11,5 +11,8 @@ go_library(
],
importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables",
visibility = ["//visibility:public"],
- deps = ["//pkg/tcpip/buffer"],
+ deps = [
+ "//pkg/log",
+ "//pkg/tcpip/buffer",
+ ],
)
diff --git a/pkg/tcpip/iptables/iptables.go b/pkg/tcpip/iptables/iptables.go
index 68c68d4aa..647970133 100644
--- a/pkg/tcpip/iptables/iptables.go
+++ b/pkg/tcpip/iptables/iptables.go
@@ -16,66 +16,114 @@
// tool.
package iptables
+// Table names.
const (
- tablenameNat = "nat"
- tablenameMangle = "mangle"
+ TablenameNat = "nat"
+ TablenameMangle = "mangle"
+ TablenameFilter = "filter"
)
// Chain names as defined by net/ipv4/netfilter/ip_tables.c.
const (
- chainNamePrerouting = "PREROUTING"
- chainNameInput = "INPUT"
- chainNameForward = "FORWARD"
- chainNameOutput = "OUTPUT"
- chainNamePostrouting = "POSTROUTING"
+ ChainNamePrerouting = "PREROUTING"
+ ChainNameInput = "INPUT"
+ ChainNameForward = "FORWARD"
+ ChainNameOutput = "OUTPUT"
+ ChainNamePostrouting = "POSTROUTING"
)
+// HookUnset indicates that there is no hook set for an entrypoint or
+// underflow.
+const HookUnset = -1
+
// DefaultTables returns a default set of tables. Each chain is set to accept
// all packets.
func DefaultTables() IPTables {
+ // TODO(gvisor.dev/issue/170): We may be able to swap out some strings for
+ // iotas.
return IPTables{
Tables: map[string]Table{
- tablenameNat: Table{
- BuiltinChains: map[Hook]Chain{
- Prerouting: unconditionalAcceptChain(chainNamePrerouting),
- Input: unconditionalAcceptChain(chainNameInput),
- Output: unconditionalAcceptChain(chainNameOutput),
- Postrouting: unconditionalAcceptChain(chainNamePostrouting),
+ TablenameNat: Table{
+ Rules: []Rule{
+ Rule{Target: UnconditionalAcceptTarget{}},
+ Rule{Target: UnconditionalAcceptTarget{}},
+ Rule{Target: UnconditionalAcceptTarget{}},
+ Rule{Target: UnconditionalAcceptTarget{}},
+ Rule{Target: ErrorTarget{}},
+ },
+ BuiltinChains: map[Hook]int{
+ Prerouting: 0,
+ Input: 1,
+ Output: 2,
+ Postrouting: 3,
},
- DefaultTargets: map[Hook]Target{
- Prerouting: UnconditionalAcceptTarget{},
- Input: UnconditionalAcceptTarget{},
- Output: UnconditionalAcceptTarget{},
- Postrouting: UnconditionalAcceptTarget{},
+ Underflows: map[Hook]int{
+ Prerouting: 0,
+ Input: 1,
+ Output: 2,
+ Postrouting: 3,
},
- UserChains: map[string]Chain{},
+ UserChains: map[string]int{},
},
- tablenameMangle: Table{
- BuiltinChains: map[Hook]Chain{
- Prerouting: unconditionalAcceptChain(chainNamePrerouting),
- Output: unconditionalAcceptChain(chainNameOutput),
+ TablenameMangle: Table{
+ Rules: []Rule{
+ Rule{Target: UnconditionalAcceptTarget{}},
+ Rule{Target: UnconditionalAcceptTarget{}},
+ Rule{Target: ErrorTarget{}},
+ },
+ BuiltinChains: map[Hook]int{
+ Prerouting: 0,
+ Output: 1,
},
- DefaultTargets: map[Hook]Target{
- Prerouting: UnconditionalAcceptTarget{},
- Output: UnconditionalAcceptTarget{},
+ Underflows: map[Hook]int{
+ Prerouting: 0,
+ Output: 1,
},
- UserChains: map[string]Chain{},
+ UserChains: map[string]int{},
+ },
+ TablenameFilter: Table{
+ Rules: []Rule{
+ Rule{Target: UnconditionalAcceptTarget{}},
+ Rule{Target: UnconditionalAcceptTarget{}},
+ Rule{Target: UnconditionalAcceptTarget{}},
+ Rule{Target: ErrorTarget{}},
+ },
+ BuiltinChains: map[Hook]int{
+ Input: 0,
+ Forward: 1,
+ Output: 2,
+ },
+ Underflows: map[Hook]int{
+ Input: 0,
+ Forward: 1,
+ Output: 2,
+ },
+ UserChains: map[string]int{},
},
},
Priorities: map[Hook][]string{
- Prerouting: []string{tablenameMangle, tablenameNat},
- Output: []string{tablenameMangle, tablenameNat},
+ Input: []string{TablenameNat, TablenameFilter},
+ Prerouting: []string{TablenameMangle, TablenameNat},
+ Output: []string{TablenameMangle, TablenameNat, TablenameFilter},
},
}
}
-func unconditionalAcceptChain(name string) Chain {
- return Chain{
- Name: name,
- Rules: []Rule{
- Rule{
- Target: UnconditionalAcceptTarget{},
- },
+// EmptyFilterTable returns a Table with no rules and the filter table chains
+// mapped to HookUnset.
+func EmptyFilterTable() Table {
+ return Table{
+ Rules: []Rule{},
+ BuiltinChains: map[Hook]int{
+ Input: HookUnset,
+ Forward: HookUnset,
+ Output: HookUnset,
+ },
+ Underflows: map[Hook]int{
+ Input: HookUnset,
+ Forward: HookUnset,
+ Output: HookUnset,
},
+ UserChains: map[string]int{},
}
}
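
To illustrate the new flat-rule-list layout, a rough sketch of how a hook's entrypoint index selects a rule (illustrative only; the real netstack check path also consults matchers, jumps and the Underflows map):

package example // hypothetical package

import (
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/iptables"
)

// entryVerdict evaluates only the entrypoint rule of the chain installed
// for hook in table, returning Accept when no chain is installed there.
func entryVerdict(table iptables.Table, hook iptables.Hook, pkt buffer.VectorisedView) iptables.Verdict {
	idx, ok := table.BuiltinChains[hook]
	if !ok || idx == iptables.HookUnset {
		return iptables.Accept
	}
	verdict, _ := table.Rules[idx].Target.Action(pkt)
	return verdict
}

With DefaultTables(), every entrypoint rule carries an UnconditionalAcceptTarget, so this sketch returns Accept for any installed hook.
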
diff --git a/pkg/tcpip/iptables/targets.go b/pkg/tcpip/iptables/targets.go
index 19a7f77e3..b94a4c941 100644
--- a/pkg/tcpip/iptables/targets.go
+++ b/pkg/tcpip/iptables/targets.go
@@ -16,7 +16,10 @@
package iptables
-import "gvisor.dev/gvisor/pkg/tcpip/buffer"
+import (
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
// UnconditionalAcceptTarget accepts all packets.
type UnconditionalAcceptTarget struct{}
@@ -33,3 +36,14 @@ type UnconditionalDropTarget struct{}
func (UnconditionalDropTarget) Action(packet buffer.VectorisedView) (Verdict, string) {
return Drop, ""
}
+
+// ErrorTarget logs an error and drops the packet. It represents a target that
+// should be unreachable.
+type ErrorTarget struct{}
+
+// Action implements Target.Action.
+func (ErrorTarget) Action(packet buffer.VectorisedView) (Verdict, string) {
+ log.Warningf("ErrorTarget triggered.")
+ return Drop, ""
+}
diff --git a/pkg/tcpip/iptables/types.go b/pkg/tcpip/iptables/types.go
index 42a79ef9f..540f8c0b4 100644
--- a/pkg/tcpip/iptables/types.go
+++ b/pkg/tcpip/iptables/types.go
@@ -61,9 +61,12 @@ const (
type Verdict int
const (
+ // Invalid indicates an unknown or erroneous verdict.
+ Invalid Verdict = iota
+
// Accept indicates the packet should continue traversing netstack as
// normal.
- Accept Verdict = iota
+ Accept
// Drop indicates the packet should be dropped, stopping traversing
// netstack.
@@ -104,29 +107,22 @@ type IPTables struct {
Priorities map[Hook][]string
}
-// A Table defines a set of chains and hooks into the network stack. The
-// currently supported tables are:
-// * nat
-// * mangle
+// A Table defines a set of chains and hooks into the network stack. It is
+// really just a list of rules with some metadata for entrypoints and such.
type Table struct {
- // BuiltinChains holds the un-deletable chains built into netstack. If
- // a hook isn't present in the map, this table doesn't utilize that
- // hook.
- BuiltinChains map[Hook]Chain
+ // Rules holds the rules that make up the table.
+ Rules []Rule
- // DefaultTargets holds a target for each hook that will be executed if
- // chain traversal doesn't yield a verdict.
- DefaultTargets map[Hook]Target
+ // BuiltinChains maps builtin chains to their entrypoint rule in Rules.
+ BuiltinChains map[Hook]int
+
+ // Underflows maps builtin chains to their underflow rule in Rules
+ // (i.e. the rule to execute if the chain returns without a verdict).
+ Underflows map[Hook]int
// UserChains holds user-defined chains keyed by name. Users
// can give their chains arbitrary names.
- UserChains map[string]Chain
-
- // Chains maps names to chains for both builtin and user-defined chains.
- // Its entries point to Chains already either in BuiltinChains or
- // UserChains, and its purpose is to make looking up tables by name
- // fast.
- Chains map[string]*Chain
+ UserChains map[string]int
// Metadata holds information about the Table that is useful to users
// of IPTables, but not to the netstack IPTables code itself.
@@ -152,21 +148,6 @@ func (table *Table) SetMetadata(metadata interface{}) {
table.metadata = metadata
}
-// A Chain defines a list of rules for packet processing. When a packet
-// traverses a chain, it is checked against each rule until either a rule
-// returns a verdict or the chain ends.
-//
-// By convention, builtin chains end with a rule that matches everything and
-// returns either Accept or Drop. User-defined chains end with Return. These
-// aren't strictly necessary here, but the iptables tool writes tables this way.
-type Chain struct {
- // Name is the chain name.
- Name string
-
- // Rules is the list of rules to traverse.
- Rules []Rule
-}
-
// A Rule is a packet processing rule. It consists of two pieces. First it
// contains zero or more matchers, each of which is a specification of which
// packets this rule applies to. If there are no matchers in the rule, it
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD
index 897c94821..66cc53ed4 100644
--- a/pkg/tcpip/link/fdbased/BUILD
+++ b/pkg/tcpip/link/fdbased/BUILD
@@ -16,6 +16,7 @@ go_library(
importpath = "gvisor.dev/gvisor/pkg/tcpip/link/fdbased",
visibility = ["//visibility:public"],
deps = [
+ "//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index fa8a703d9..b7f60178e 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -41,10 +41,10 @@ package fdbased
import (
"fmt"
- "sync"
"syscall"
"golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD
index a4f9cdd69..09165dd4c 100644
--- a/pkg/tcpip/link/sharedmem/BUILD
+++ b/pkg/tcpip/link/sharedmem/BUILD
@@ -15,6 +15,7 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//pkg/log",
+ "//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
@@ -31,6 +32,7 @@ go_test(
],
embed = [":sharedmem"],
deps = [
+ "//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD
index 6b5bc542c..a0d4ad0be 100644
--- a/pkg/tcpip/link/sharedmem/pipe/BUILD
+++ b/pkg/tcpip/link/sharedmem/pipe/BUILD
@@ -21,4 +21,5 @@ go_test(
"pipe_test.go",
],
embed = [":pipe"],
+ deps = ["//pkg/sync"],
)
diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
index 59ef69a8b..dc239a0d0 100644
--- a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
+++ b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
@@ -18,8 +18,9 @@ import (
"math/rand"
"reflect"
"runtime"
- "sync"
"testing"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
func TestSimpleReadWrite(t *testing.T) {
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
index 080f9d667..655e537c4 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -23,11 +23,11 @@
package sharedmem
import (
- "sync"
"sync/atomic"
"syscall"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
index 89603c48f..5c729a439 100644
--- a/pkg/tcpip/link/sharedmem/sharedmem_test.go
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -22,11 +22,11 @@ import (
"math/rand"
"os"
"strings"
- "sync"
"syscall"
"testing"
"time"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
index da8482509..42cacb8a6 100644
--- a/pkg/tcpip/network/arp/arp.go
+++ b/pkg/tcpip/network/arp/arp.go
@@ -79,16 +79,16 @@ func (e *endpoint) MaxHeaderLength() uint16 {
func (e *endpoint) Close() {}
-func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, stack.PacketLooping, tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, tcpip.PacketBuffer) *tcpip.Error {
return tcpip.ErrNotSupported
}
// WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, stack.NetworkHeaderParams, stack.PacketLooping) (int, *tcpip.Error) {
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, []tcpip.PacketBuffer, stack.NetworkHeaderParams) (int, *tcpip.Error) {
return 0, tcpip.ErrNotSupported
}
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
return tcpip.ErrNotSupported
}
diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD
index acf1e022c..ed16076fd 100644
--- a/pkg/tcpip/network/fragmentation/BUILD
+++ b/pkg/tcpip/network/fragmentation/BUILD
@@ -28,6 +28,7 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//pkg/log",
+ "//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
],
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go
index 6da5238ec..92f2aa13a 100644
--- a/pkg/tcpip/network/fragmentation/fragmentation.go
+++ b/pkg/tcpip/network/fragmentation/fragmentation.go
@@ -19,9 +19,9 @@ package fragmentation
import (
"fmt"
"log"
- "sync"
"time"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
)
diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go
index 9e002e396..0a83d81f2 100644
--- a/pkg/tcpip/network/fragmentation/reassembler.go
+++ b/pkg/tcpip/network/fragmentation/reassembler.go
@@ -18,9 +18,9 @@ import (
"container/heap"
"fmt"
"math"
- "sync"
"time"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
)
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
index 4144a7837..f1bc33adf 100644
--- a/pkg/tcpip/network/ip_test.go
+++ b/pkg/tcpip/network/ip_test.go
@@ -239,7 +239,7 @@ func TestIPv4Send(t *testing.T) {
if err != nil {
t.Fatalf("could not find route: %v", err)
}
- if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketOut, tcpip.PacketBuffer{
+ if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
Header: hdr,
Data: payload.ToVectorisedView(),
}); err != nil {
@@ -480,7 +480,7 @@ func TestIPv6Send(t *testing.T) {
if err != nil {
t.Fatalf("could not find route: %v", err)
}
- if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketOut, tcpip.PacketBuffer{
+ if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, tcpip.PacketBuffer{
Header: hdr,
Data: payload.ToVectorisedView(),
}); err != nil {
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index e645cf62c..4ee3d5b45 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -238,11 +238,11 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
}
// WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
pkt.NetworkHeader = buffer.View(ip)
- if loop&stack.PacketLoop != 0 {
+ if r.Loop&stack.PacketLoop != 0 {
// The inbound path expects the network header to still be in
// the PacketBuffer's Data field.
views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
@@ -256,7 +256,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
loopedR.Release()
}
- if loop&stack.PacketOut == 0 {
+ if r.Loop&stack.PacketOut == 0 {
return nil
}
if pkt.Header.UsedLength()+pkt.Data.Size() > int(e.linkEP.MTU()) && (gso == nil || gso.Type == stack.GSONone) {
@@ -270,11 +270,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
}
// WritePackets implements stack.NetworkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
- if loop&stack.PacketLoop != 0 {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+ if r.Loop&stack.PacketLoop != 0 {
panic("multiple packets in local loop")
}
- if loop&stack.PacketOut == 0 {
+ if r.Loop&stack.PacketOut == 0 {
return len(pkts), nil
}
@@ -289,7 +289,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
// WriteHeaderIncludedPacket writes a packet already containing a network
// header through the given route.
-func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
// The packet already has an IP header, but there are a few required
// checks.
ip := header.IPv4(pkt.Data.First())
@@ -324,10 +324,10 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLo
ip.SetChecksum(0)
ip.SetChecksum(^ip.CalculateChecksum())
- if loop&stack.PacketLoop != 0 {
+ if r.Loop&stack.PacketLoop != 0 {
e.HandlePacket(r, pkt.Clone())
}
- if loop&stack.PacketOut == 0 {
+ if r.Loop&stack.PacketOut == 0 {
return nil
}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index dd31f0fb7..58c3c79b9 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -112,11 +112,11 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS
}
// WritePacket writes a packet to the given destination address and protocol.
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
pkt.NetworkHeader = buffer.View(ip)
- if loop&stack.PacketLoop != 0 {
+ if r.Loop&stack.PacketLoop != 0 {
// The inbound path expects the network header to still be in
// the PacketBuffer's Data field.
views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
@@ -130,7 +130,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
loopedR.Release()
}
- if loop&stack.PacketOut == 0 {
+ if r.Loop&stack.PacketOut == 0 {
return nil
}
@@ -139,11 +139,11 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.Netw
}
// WritePackets implements stack.LinkEndpoint.WritePackets.
-func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
- if loop&stack.PacketLoop != 0 {
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+ if r.Loop&stack.PacketLoop != 0 {
panic("not implemented")
}
- if loop&stack.PacketOut == 0 {
+ if r.Loop&stack.PacketOut == 0 {
return len(pkts), nil
}
@@ -161,8 +161,8 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.Pac
// WriteHeaderIncludedPacket implements stack.NetworkEndpoint. It is not yet
// supported by IPv6.
-func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
- // TODO(b/119580726): Support IPv6 header-included packets.
+func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
+ // TODO(b/146666412): Support IPv6 header-included packets.
return tcpip.ErrNotSupported
}
diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD
index 4839f0a65..a6ef3bdcc 100644
--- a/pkg/tcpip/ports/BUILD
+++ b/pkg/tcpip/ports/BUILD
@@ -1,5 +1,5 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_stateify:defs.bzl", "go_library")
package(licenses = ["notice"])
@@ -9,6 +9,7 @@ go_library(
importpath = "gvisor.dev/gvisor/pkg/tcpip/ports",
visibility = ["//visibility:public"],
deps = [
+ "//pkg/sync",
"//pkg/tcpip",
],
)
diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go
index 30cea8996..b937cb84b 100644
--- a/pkg/tcpip/ports/ports.go
+++ b/pkg/tcpip/ports/ports.go
@@ -18,9 +18,9 @@ package ports
import (
"math"
"math/rand"
- "sync"
"sync/atomic"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
)
@@ -41,6 +41,30 @@ type portDescriptor struct {
port uint16
}
+// Flags represents the type of port reservation.
+//
+// +stateify savable
+type Flags struct {
+ // MostRecent represents UDP SO_REUSEADDR.
+ MostRecent bool
+
+ // LoadBalanced indicates SO_REUSEPORT.
+ //
+ // LoadBalanced takes precedence over MostRecent.
+ LoadBalanced bool
+}
+
+func (f Flags) bits() reuseFlag {
+ var rf reuseFlag
+ if f.MostRecent {
+ rf |= mostRecentFlag
+ }
+ if f.LoadBalanced {
+ rf |= loadBalancedFlag
+ }
+ return rf
+}
+
// PortManager manages allocating, reserving and releasing ports.
type PortManager struct {
mu sync.RWMutex
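
For orientation, a short sketch of how the new Flags type is meant to be passed to the port manager (illustrative; the protocol numbers, address and port are placeholders, and only the IsPortAvailable signature shown later in this diff is relied on):

package example // hypothetical package

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/ports"
)

// reusePortAvailable asks whether a port could be bound with
// SO_REUSEPORT-style sharing; endpoints already holding the port must
// share at least one reuse flag for this to succeed.
func reusePortAvailable(pm *ports.PortManager, addr tcpip.Address, port uint16) bool {
	flags := ports.Flags{
		LoadBalanced: true, // SO_REUSEPORT
	}
	return pm.IsPortAvailable(
		[]tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber},
		header.UDPProtocolNumber,
		addr,
		port,
		flags,
		0, // not bound to a specific device
	)
}
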
@@ -54,9 +78,59 @@ type PortManager struct {
hint uint32
}
+type reuseFlag int
+
+const (
+ mostRecentFlag reuseFlag = 1 << iota
+ loadBalancedFlag
+ nextFlag
+
+ flagMask = nextFlag - 1
+)
+
type portNode struct {
- reuse bool
- refs int
+ // refs stores the count for each possible flag combination.
+ refs [nextFlag]int
+}
+
+func (p portNode) totalRefs() int {
+ var total int
+ for _, r := range p.refs {
+ total += r
+ }
+ return total
+}
+
+// flagRefs returns the number of references with all specified flags.
+func (p portNode) flagRefs(flags reuseFlag) int {
+ var total int
+ for i, r := range p.refs {
+ if reuseFlag(i)&flags == flags {
+ total += r
+ }
+ }
+ return total
+}
+
+// allRefsHave returns if all references have all specified flags.
+func (p portNode) allRefsHave(flags reuseFlag) bool {
+ for i, r := range p.refs {
+ if reuseFlag(i)&flags == flags && r > 0 {
+ return false
+ }
+ }
+ return true
+}
+
+// intersectionRefs returns the set of flags shared by all references.
+func (p portNode) intersectionRefs() reuseFlag {
+ intersection := flagMask
+ for i, r := range p.refs {
+ if r > 0 {
+ intersection &= reuseFlag(i)
+ }
+ }
+ return intersection
}
// deviceNode is never empty. When it has no elements, it is removed from the
@@ -66,30 +140,44 @@ type deviceNode map[tcpip.NICID]portNode
// isAvailable checks whether binding is possible by device. If not binding to a
// device, check against all portNodes. If binding to a specific device, check
// against the unspecified device and the provided device.
-func (d deviceNode) isAvailable(reuse bool, bindToDevice tcpip.NICID) bool {
+//
+// If either of the port reuse flags is enabled on any of the nodes, all nodes
+// sharing a port must share at least one reuse flag. This matches Linux's
+// behavior.
+func (d deviceNode) isAvailable(flags Flags, bindToDevice tcpip.NICID) bool {
+ flagBits := flags.bits()
if bindToDevice == 0 {
// Trying to bind to all devices.
- if !reuse {
+ if flagBits == 0 {
// Can't bind because the (addr,port) is already bound.
return false
}
+ intersection := flagMask
for _, p := range d {
- if !p.reuse {
- // Can't bind because the (addr,port) was previously bound without reuse.
+ i := p.intersectionRefs()
+ intersection &= i
+ if intersection&flagBits == 0 {
+ // Can't bind because the (addr,port) was
+ // previously bound without reuse.
return false
}
}
return true
}
+ intersection := flagMask
+
if p, ok := d[0]; ok {
- if !reuse || !p.reuse {
+ intersection = p.intersectionRefs()
+ if intersection&flagBits == 0 {
return false
}
}
if p, ok := d[bindToDevice]; ok {
- if !reuse || !p.reuse {
+ i := p.intersectionRefs()
+ intersection &= i
+ if intersection&flagBits == 0 {
return false
}
}
@@ -103,12 +191,12 @@ type bindAddresses map[tcpip.Address]deviceNode
// isAvailable checks whether an IP address is available to bind to. If the
// address is the "any" address, check all other addresses. Otherwise, just
// check against the "any" address and the provided address.
-func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool, bindToDevice tcpip.NICID) bool {
+func (b bindAddresses) isAvailable(addr tcpip.Address, flags Flags, bindToDevice tcpip.NICID) bool {
if addr == anyIPAddress {
// If binding to the "any" address then check that there are no conflicts
// with all addresses.
for _, d := range b {
- if !d.isAvailable(reuse, bindToDevice) {
+ if !d.isAvailable(flags, bindToDevice) {
return false
}
}
@@ -117,14 +205,14 @@ func (b bindAddresses) isAvailable(addr tcpip.Address, reuse bool, bindToDevice
// Check that there is no conflict with the "any" address.
if d, ok := b[anyIPAddress]; ok {
- if !d.isAvailable(reuse, bindToDevice) {
+ if !d.isAvailable(flags, bindToDevice) {
return false
}
}
// Check that there is no conflict with the provided address.
if d, ok := b[addr]; ok {
- if !d.isAvailable(reuse, bindToDevice) {
+ if !d.isAvailable(flags, bindToDevice) {
return false
}
}
@@ -190,17 +278,17 @@ func (s *PortManager) pickEphemeralPort(offset, count uint32, testPort func(p ui
}
// IsPortAvailable tests if the given port is available on all given protocols.
-func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
+func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool {
s.mu.Lock()
defer s.mu.Unlock()
- return s.isPortAvailableLocked(networks, transport, addr, port, reuse, bindToDevice)
+ return s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice)
}
-func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
+func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool {
for _, network := range networks {
desc := portDescriptor{network, transport, port}
if addrs, ok := s.allocatedPorts[desc]; ok {
- if !addrs.isAvailable(addr, reuse, bindToDevice) {
+ if !addrs.isAvailable(addr, flags, bindToDevice) {
return false
}
}
@@ -212,14 +300,14 @@ func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumb
// reserved by another endpoint. If port is zero, ReservePort will search for
// an unreserved ephemeral port and reserve it, returning its value in the
// "port" return value.
-func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) (reservedPort uint16, err *tcpip.Error) {
+func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) (reservedPort uint16, err *tcpip.Error) {
s.mu.Lock()
defer s.mu.Unlock()
// If a port is specified, just try to reserve it for all network
// protocols.
if port != 0 {
- if !s.reserveSpecificPort(networks, transport, addr, port, reuse, bindToDevice) {
+ if !s.reserveSpecificPort(networks, transport, addr, port, flags, bindToDevice) {
return 0, tcpip.ErrPortInUse
}
return port, nil
@@ -227,15 +315,16 @@ func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transp
// A port wasn't specified, so try to find one.
return s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) {
- return s.reserveSpecificPort(networks, transport, addr, p, reuse, bindToDevice), nil
+ return s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice), nil
})
}
// reserveSpecificPort tries to reserve the given port on all given protocols.
-func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, reuse bool, bindToDevice tcpip.NICID) bool {
- if !s.isPortAvailableLocked(networks, transport, addr, port, reuse, bindToDevice) {
+func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool {
+ if !s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice) {
return false
}
+ flagBits := flags.bits()
// Reserve port on all network protocols.
for _, network := range networks {
@@ -250,12 +339,9 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber
d = make(deviceNode)
m[addr] = d
}
- if n, ok := d[bindToDevice]; ok {
- n.refs++
- d[bindToDevice] = n
- } else {
- d[bindToDevice] = portNode{reuse: reuse, refs: 1}
- }
+ n := d[bindToDevice]
+ n.refs[flagBits]++
+ d[bindToDevice] = n
}
return true
@@ -263,10 +349,12 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber
// ReleasePort releases the reservation on a port/IP combination so that it can
// be reserved by other endpoints.
-func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, bindToDevice tcpip.NICID) {
+func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) {
s.mu.Lock()
defer s.mu.Unlock()
+ flagBits := flags.bits()
+
for _, network := range networks {
desc := portDescriptor{network, transport, port}
if m, ok := s.allocatedPorts[desc]; ok {
@@ -278,9 +366,9 @@ func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transp
if !ok {
continue
}
- n.refs--
+ n.refs[flagBits]--
d[bindToDevice] = n
- if n.refs == 0 {
+ if n.refs == [nextFlag]int{} {
delete(d, bindToDevice)
}
if len(d) == 0 {
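The ports.go change above replaces the single reuse boolean on portNode with a refs array indexed by the bit pattern of a reservation's Flags; availability is then decided by intersecting the flag sets of all live references, so a new bind must share at least one reuse flag with every existing one. A standalone sketch of that intersection idea (the names below are illustrative, not the package's API):

package main

import "fmt"

const (
	mostRecent   = 1 << iota // SO_REUSEADDR-style reservation
	loadBalanced             // SO_REUSEPORT-style reservation
	nextFlag

	flagMask = nextFlag - 1
)

// refs[i] counts reservations whose flag bits are exactly i.
type node struct{ refs [nextFlag]int }

// intersection returns the flags shared by every live reservation.
func (n node) intersection() int {
	out := flagMask
	for bits, count := range n.refs {
		if count > 0 {
			out &= bits
		}
	}
	return out
}

func main() {
	var n node
	n.refs[loadBalanced] = 2            // two SO_REUSEPORT binds
	n.refs[mostRecent|loadBalanced] = 1 // one bind with both flags set
	// A new bind is allowed only if it shares a flag with the intersection.
	fmt.Println(n.intersection()&loadBalanced != 0) // true: another SO_REUSEPORT bind is fine
	fmt.Println(n.intersection()&mostRecent != 0)   // false: a SO_REUSEADDR-only bind is rejected
}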
diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go
index 19f4833fc..d6969d050 100644
--- a/pkg/tcpip/ports/ports_test.go
+++ b/pkg/tcpip/ports/ports_test.go
@@ -33,7 +33,7 @@ type portReserveTestAction struct {
port uint16
ip tcpip.Address
want *tcpip.Error
- reuse bool
+ flags Flags
release bool
device tcpip.NICID
}
@@ -50,7 +50,7 @@ func TestPortReservation(t *testing.T) {
{port: 80, ip: fakeIPAddress1, want: nil},
/* N.B. Order of tests matters! */
{port: 80, ip: anyIPAddress, want: tcpip.ErrPortInUse},
- {port: 80, ip: fakeIPAddress, want: tcpip.ErrPortInUse, reuse: true},
+ {port: 80, ip: fakeIPAddress, want: tcpip.ErrPortInUse, flags: Flags{LoadBalanced: true}},
},
},
{
@@ -61,7 +61,7 @@ func TestPortReservation(t *testing.T) {
/* release fakeIPAddress, but anyIPAddress is still inuse */
{port: 22, ip: fakeIPAddress, release: true},
{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
- {port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse, reuse: true},
+ {port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse, flags: Flags{LoadBalanced: true}},
/* Release port 22 from any IP address, then try to reserve fake IP address on 22 */
{port: 22, ip: anyIPAddress, want: nil, release: true},
{port: 22, ip: fakeIPAddress, want: nil},
@@ -71,36 +71,36 @@ func TestPortReservation(t *testing.T) {
actions: []portReserveTestAction{
{port: 00, ip: fakeIPAddress, want: nil},
{port: 00, ip: fakeIPAddress, want: nil},
- {port: 00, ip: fakeIPAddress, reuse: true, want: nil},
+ {port: 00, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
},
}, {
tname: "bind to ip with reuseport",
actions: []portReserveTestAction{
- {port: 25, ip: fakeIPAddress, reuse: true, want: nil},
- {port: 25, ip: fakeIPAddress, reuse: true, want: nil},
+ {port: 25, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+ {port: 25, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
- {port: 25, ip: fakeIPAddress, reuse: false, want: tcpip.ErrPortInUse},
- {port: 25, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse},
+ {port: 25, ip: fakeIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+ {port: 25, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
- {port: 25, ip: anyIPAddress, reuse: true, want: nil},
+ {port: 25, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
},
}, {
tname: "bind to inaddr any with reuseport",
actions: []portReserveTestAction{
- {port: 24, ip: anyIPAddress, reuse: true, want: nil},
- {port: 24, ip: anyIPAddress, reuse: true, want: nil},
+ {port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+ {port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
- {port: 24, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, reuse: false, want: tcpip.ErrPortInUse},
+ {port: 24, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+ {port: 24, ip: fakeIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, reuse: true, want: nil},
- {port: 24, ip: fakeIPAddress, release: true, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, release: true, want: nil},
- {port: 24, ip: anyIPAddress, release: true},
- {port: 24, ip: anyIPAddress, reuse: false, want: tcpip.ErrPortInUse},
+ {port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, release: true},
+ {port: 24, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
- {port: 24, ip: anyIPAddress, release: true},
- {port: 24, ip: anyIPAddress, reuse: false, want: nil},
+ {port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, release: true},
+ {port: 24, ip: anyIPAddress, flags: Flags{}, want: nil},
},
}, {
tname: "bind twice with device fails",
@@ -125,88 +125,152 @@ func TestPortReservation(t *testing.T) {
actions: []portReserveTestAction{
{port: 24, ip: fakeIPAddress, want: nil},
{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: tcpip.ErrPortInUse},
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, reuse: true, want: tcpip.ErrPortInUse},
+ {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
},
}, {
tname: "bind with device",
actions: []portReserveTestAction{
{port: 24, ip: fakeIPAddress, device: 123, want: nil},
{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: tcpip.ErrPortInUse},
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, device: 456, reuse: true, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+ {port: 24, ip: fakeIPAddress, device: 456, flags: Flags{LoadBalanced: true}, want: nil},
{port: 24, ip: fakeIPAddress, device: 789, want: nil},
{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, reuse: true, want: tcpip.ErrPortInUse},
+ {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
},
}, {
- tname: "bind with reuse",
+ tname: "bind with reuseport",
actions: []portReserveTestAction{
- {port: 24, ip: fakeIPAddress, reuse: true, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
},
}, {
- tname: "binding with reuse and device",
+ tname: "binding with reuseport and device",
actions: []portReserveTestAction{
- {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, device: 456, reuse: true, want: nil},
- {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
- {port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 456, flags: Flags{LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
{port: 24, ip: fakeIPAddress, device: 999, want: tcpip.ErrPortInUse},
},
}, {
- tname: "mixing reuse and not reuse by binding to device",
+ tname: "mixing reuseport and not reuseport by binding to device",
actions: []portReserveTestAction{
- {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
{port: 24, ip: fakeIPAddress, device: 456, want: nil},
- {port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
{port: 24, ip: fakeIPAddress, device: 999, want: nil},
},
}, {
- tname: "can't bind to 0 after mixing reuse and not reuse",
+ tname: "can't bind to 0 after mixing reuseport and not reuseport",
actions: []portReserveTestAction{
- {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
{port: 24, ip: fakeIPAddress, device: 456, want: nil},
- {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
+ {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
},
}, {
tname: "bind and release",
actions: []portReserveTestAction{
- {port: 24, ip: fakeIPAddress, device: 123, reuse: true, want: nil},
- {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil},
- {port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: tcpip.ErrPortInUse},
- {port: 24, ip: fakeIPAddress, device: 789, reuse: true, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: tcpip.ErrPortInUse},
+ {port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
// Release the bind to device 0 and try again.
- {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: nil, release: true},
- {port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil, release: true},
+ {port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: nil},
},
}, {
- tname: "bind twice with reuse once",
+ tname: "bind twice with reuseport once",
actions: []portReserveTestAction{
- {port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil},
- {port: 24, ip: fakeIPAddress, device: 0, reuse: true, want: tcpip.ErrPortInUse},
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
},
}, {
tname: "release an unreserved device",
actions: []portReserveTestAction{
- {port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil},
- {port: 24, ip: fakeIPAddress, device: 456, reuse: false, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 456, flags: Flags{}, want: nil},
// The below don't exist.
- {port: 24, ip: fakeIPAddress, device: 345, reuse: false, want: nil, release: true},
- {port: 9999, ip: fakeIPAddress, device: 123, reuse: false, want: nil, release: true},
+ {port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: nil, release: true},
+ {port: 9999, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil, release: true},
// Release all.
- {port: 24, ip: fakeIPAddress, device: 123, reuse: false, want: nil, release: true},
- {port: 24, ip: fakeIPAddress, device: 456, reuse: false, want: nil, release: true},
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil, release: true},
+ {port: 24, ip: fakeIPAddress, device: 456, flags: Flags{}, want: nil, release: true},
+ },
+ }, {
+ tname: "bind with reuseaddr",
+ actions: []portReserveTestAction{
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{MostRecent: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+ {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{MostRecent: true}, want: nil},
+ },
+ }, {
+ tname: "bind twice with reuseaddr once",
+ actions: []portReserveTestAction{
+ {port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+ {port: 24, ip: fakeIPAddress, device: 0, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
+ },
+ }, {
+ tname: "bind with reuseaddr and reuseport",
+ actions: []portReserveTestAction{
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+ },
+ }, {
+ tname: "bind with reuseaddr and reuseport, and then reuseaddr",
+ actions: []portReserveTestAction{
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+ },
+ }, {
+ tname: "bind with reuseaddr and reuseport, and then reuseport",
+ actions: []portReserveTestAction{
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
+ },
+ }, {
+ tname: "bind with reuseaddr and reuseport twice, and then reuseaddr",
+ actions: []portReserveTestAction{
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+ },
+ }, {
+ tname: "bind with reuseaddr and reuseport twice, and then reuseport",
+ actions: []portReserveTestAction{
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+ },
+ }, {
+ tname: "bind with reuseaddr, and then reuseaddr and reuseport",
+ actions: []portReserveTestAction{
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+ },
+ }, {
+ tname: "bind with reuseport, and then reuseaddr and reuseport",
+ actions: []portReserveTestAction{
+ {port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+ {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
},
},
} {
@@ -216,12 +280,12 @@ func TestPortReservation(t *testing.T) {
for _, test := range test.actions {
if test.release {
- pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.device)
+ pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device)
continue
}
- gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.reuse, test.device)
+ gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device)
if err != test.want {
- t.Fatalf("ReservePort(.., .., %s, %d, %t, %d) = %v, want %v", test.ip, test.port, test.reuse, test.device, err, test.want)
+ t.Fatalf("ReservePort(.., .., %s, %d, %+v, %d) = %v, want %v", test.ip, test.port, test.flags, test.device, err, test.want)
}
if test.port == 0 && (gotPort == 0 || gotPort < FirstEphemeral) {
t.Fatalf("ReservePort(.., .., .., 0) = %d, want port number >= %d to be picked", gotPort, FirstEphemeral)
diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD
index a57752a7c..d7496fde6 100644
--- a/pkg/tcpip/sample/tun_tcp_connect/BUILD
+++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
go_binary(
name = "tun_tcp_connect",
srcs = ["main.go"],
+ visibility = ["//:sandbox"],
deps = [
"//pkg/tcpip",
"//pkg/tcpip/buffer",
diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD
index dad8ef399..875561566 100644
--- a/pkg/tcpip/sample/tun_tcp_echo/BUILD
+++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD
@@ -5,6 +5,7 @@ package(licenses = ["notice"])
go_binary(
name = "tun_tcp_echo",
srcs = ["main.go"],
+ visibility = ["//:sandbox"],
deps = [
"//pkg/tcpip",
"//pkg/tcpip/link/fdbased",
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
index 69077669a..783351a69 100644
--- a/pkg/tcpip/stack/BUILD
+++ b/pkg/tcpip/stack/BUILD
@@ -36,6 +36,7 @@ go_library(
"//pkg/ilist",
"//pkg/rand",
"//pkg/sleep",
+ "//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/hash/jenkins",
@@ -50,7 +51,7 @@ go_library(
go_test(
name = "stack_x_test",
- size = "small",
+ size = "medium",
srcs = [
"ndp_test.go",
"stack_test.go",
@@ -59,6 +60,7 @@ go_test(
],
deps = [
":stack",
+ "//pkg/rand",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/checker",
@@ -82,6 +84,7 @@ go_test(
embed = [":stack"],
deps = [
"//pkg/sleep",
+ "//pkg/sync",
"//pkg/tcpip",
],
)
diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go
index 267df60d1..403557fd7 100644
--- a/pkg/tcpip/stack/linkaddrcache.go
+++ b/pkg/tcpip/stack/linkaddrcache.go
@@ -16,10 +16,10 @@ package stack
import (
"fmt"
- "sync"
"time"
"gvisor.dev/gvisor/pkg/sleep"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
)
diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go
index 9946b8fe8..1baa498d0 100644
--- a/pkg/tcpip/stack/linkaddrcache_test.go
+++ b/pkg/tcpip/stack/linkaddrcache_test.go
@@ -16,12 +16,12 @@ package stack
import (
"fmt"
- "sync"
"sync/atomic"
"testing"
"time"
"gvisor.dev/gvisor/pkg/sleep"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
)
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
index cfdd0496e..c99d387d5 100644
--- a/pkg/tcpip/stack/ndp.go
+++ b/pkg/tcpip/stack/ndp.go
@@ -17,6 +17,7 @@ package stack
import (
"fmt"
"log"
+ "math/rand"
"time"
"gvisor.dev/gvisor/pkg/tcpip"
@@ -38,25 +39,45 @@ const (
// Default = 1s (from RFC 4861 section 10).
defaultRetransmitTimer = time.Second
+ // defaultMaxRtrSolicitations is the default number of Router
+ // Solicitation messages to send when a NIC becomes enabled.
+ //
+ // Default = 3 (from RFC 4861 section 10).
+ defaultMaxRtrSolicitations = 3
+
+ // defaultRtrSolicitationInterval is the default amount of time between
+ // sending Router Solicitation messages.
+ //
+ // Default = 4s (from RFC 4861 section 10).
+ defaultRtrSolicitationInterval = 4 * time.Second
+
+ // defaultMaxRtrSolicitationDelay is the default maximum amount of time
+ // to wait before sending the first Router Solicitation message.
+ //
+ // Default = 1s (from RFC 4861 section 10).
+ defaultMaxRtrSolicitationDelay = time.Second
+
// defaultHandleRAs is the default configuration for whether or not to
// handle incoming Router Advertisements as a host.
- //
- // Default = true.
defaultHandleRAs = true
// defaultDiscoverDefaultRouters is the default configuration for
// whether or not to discover default routers from incoming Router
// Advertisements, as a host.
- //
- // Default = true.
defaultDiscoverDefaultRouters = true
// defaultDiscoverOnLinkPrefixes is the default configuration for
// whether or not to discover on-link prefixes from incoming Router
// Advertisements' Prefix Information option, as a host.
+ defaultDiscoverOnLinkPrefixes = true
+
+ // defaultAutoGenGlobalAddresses is the default configuration for
+ // whether or not to generate global IPv6 addresses in response to
+ // receiving a new Prefix Information option with its Autonomous
+ // Address AutoConfiguration flag set, as a host.
//
// Default = true.
- defaultDiscoverOnLinkPrefixes = true
+ defaultAutoGenGlobalAddresses = true
// minimumRetransmitTimer is the minimum amount of time to wait between
// sending NDP Neighbor solicitation messages. Note, RFC 4861 does
@@ -66,27 +87,74 @@ const (
// value of 0 means unspecified, so the smallest valid value is 1.
// Note, the unit of the RetransmitTimer field in the Router
// Advertisement is milliseconds.
- //
- // Min = 1ms.
minimumRetransmitTimer = time.Millisecond
+ // minimumRtrSolicitationInterval is the minimum amount of time to wait
+ // between sending Router Solicitation messages. This limit is imposed
+ // to make sure that Router Solicitation messages are not sent all at
+ // once, defeating the purpose of sending the initial few messages.
+ minimumRtrSolicitationInterval = 500 * time.Millisecond
+
+ // minimumMaxRtrSolicitationDelay is the minimum amount of time to wait
+ // before sending the first Router Solicitation message. It is 0 because
+ // we cannot have a negative delay.
+ minimumMaxRtrSolicitationDelay = 0
+
// MaxDiscoveredDefaultRouters is the maximum number of discovered
// default routers. The stack should stop discovering new routers after
// discovering MaxDiscoveredDefaultRouters routers.
//
// This value MUST be at minimum 2 as per RFC 4861 section 6.3.4, and
// SHOULD be more.
- //
- // Max = 10.
MaxDiscoveredDefaultRouters = 10
// MaxDiscoveredOnLinkPrefixes is the maximum number of discovered
// on-link prefixes. The stack should stop discovering new on-link
// prefixes after discovering MaxDiscoveredOnLinkPrefixes on-link
// prefixes.
- //
- // Max = 10.
MaxDiscoveredOnLinkPrefixes = 10
+
+ // validPrefixLenForAutoGen is the expected prefix length that an
+ // address can be generated for. Must be 64 bits as the interface
+ // identifier (IID) is 64 bits and an IPv6 address is 128 bits, so
+ // 128 - 64 = 64.
+ validPrefixLenForAutoGen = 64
+)
+
+var (
+ // MinPrefixInformationValidLifetimeForUpdate is the minimum Valid
+ // Lifetime to update the valid lifetime of a generated address by
+ // SLAAC.
+ //
+ // This is exported as a variable (instead of a constant) so tests
+ // can update it to a smaller value.
+ //
+ // Min = 2hrs.
+ MinPrefixInformationValidLifetimeForUpdate = 2 * time.Hour
+)
+
+// DHCPv6ConfigurationFromNDPRA is a configuration available via DHCPv6 that an
+// NDP Router Advertisement informed the Stack about.
+type DHCPv6ConfigurationFromNDPRA int
+
+const (
+ // DHCPv6NoConfiguration indicates that no configurations are available via
+ // DHCPv6.
+ DHCPv6NoConfiguration DHCPv6ConfigurationFromNDPRA = iota
+
+ // DHCPv6ManagedAddress indicates that addresses are available via DHCPv6.
+ //
+ // DHCPv6ManagedAddress also implies DHCPv6OtherConfigurations because DHCPv6
+ // will return all available configuration information.
+ DHCPv6ManagedAddress
+
+ // DHCPv6OtherConfigurations indicates that other configuration information is
+ // available via DHCPv6.
+ //
+ // Other configurations are configurations other than addresses. Examples of
+ // other configurations are recursive DNS server list, DNS search lists and
+ // default gateway.
+ DHCPv6OtherConfigurations
)
// NDPDispatcher is the interface integrators of netstack must implement to
@@ -105,40 +173,83 @@ type NDPDispatcher interface {
OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error)
// OnDefaultRouterDiscovered will be called when a new default router is
- // discovered. Implementations must return true along with a new valid
- // route table if the newly discovered router should be remembered. If
- // an implementation returns false, the second return value will be
- // ignored.
+ // discovered. Implementations must return true if the newly discovered
+ // router should be remembered.
//
// This function is not permitted to block indefinitely. This function
// is also not permitted to call into the stack.
- OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) (bool, []tcpip.Route)
+ OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool
// OnDefaultRouterInvalidated will be called when a discovered default
- // router is invalidated. Implementers must return a new valid route
- // table.
+ // router that was remembered is invalidated.
//
// This function is not permitted to block indefinitely. This function
// is also not permitted to call into the stack.
- OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) []tcpip.Route
+ OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address)
// OnOnLinkPrefixDiscovered will be called when a new on-link prefix is
- // discovered. Implementations must return true along with a new valid
- // route table if the newly discovered on-link prefix should be
- // remembered. If an implementation returns false, the second return
- // value will be ignored.
+ // discovered. Implementations must return true if the newly discovered
+ // on-link prefix should be remembered.
//
// This function is not permitted to block indefinitely. This function
// is also not permitted to call into the stack.
- OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) (bool, []tcpip.Route)
+ OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool
// OnOnLinkPrefixInvalidated will be called when a discovered on-link
- // prefix is invalidated. Implementers must return a new valid route
- // table.
+ // prefix that was remembered is invalidated.
//
// This function is not permitted to block indefinitely. This function
// is also not permitted to call into the stack.
- OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) []tcpip.Route
+ OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet)
+
+ // OnAutoGenAddress will be called when a new prefix with its
+ // autonomous address-configuration flag set has been received and SLAAC
+ // has been performed. Implementations may prevent the stack from
+ // assigning the address to the NIC by returning false.
+ //
+ // This function is not permitted to block indefinitely. It must not
+ // call functions on the stack itself.
+ OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool
+
+ // OnAutoGenAddressDeprecated will be called when an auto-generated
+ // address (as part of SLAAC) has been deprecated, but is still
+ // considered valid. Note, if an address is invalidated at the same
+ // time it is deprecated, the deprecation event MAY be omitted.
+ //
+ // This function is not permitted to block indefinitely. It must not
+ // call functions on the stack itself.
+ OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix)
+
+ // OnAutoGenAddressInvalidated will be called when an auto-generated
+ // address (as part of SLAAC) has been invalidated.
+ //
+ // This function is not permitted to block indefinitely. It must not
+ // call functions on the stack itself.
+ OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix)
+
+ // OnRecursiveDNSServerOption will be called when an NDP option with
+ // recursive DNS servers has been received. Note, addrs may contain
+ // link-local addresses.
+ //
+ // It is up to the caller to use the DNS Servers only for their valid
+ // lifetime. OnRecursiveDNSServerOption may be called for new or
+ // already known DNS servers. If called with known DNS servers, their
+ // valid lifetimes must be refreshed to lifetime (it may be increased,
+ // decreased, or completely invalidated when lifetime = 0).
+ //
+ // This function is not permitted to block indefinitely. It must not
+ // call functions on the stack itself.
+ OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration)
+
+ // OnDHCPv6Configuration will be called with an updated configuration that is
+ // available via DHCPv6 for a specified NIC.
+ //
+ // NDPDispatcher assumes that the initial configuration available by DHCPv6 is
+ // DHCPv6NoConfiguration.
+ //
+ // This function is not permitted to block indefinitely. It must not
+ // call functions on the stack itself.
+ OnDHCPv6Configuration(tcpip.NICID, DHCPv6ConfigurationFromNDPRA)
}
// NDPConfigurations is the NDP configurations for the netstack.
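With these signature changes, an integrator no longer hands route tables back to the stack from discovery callbacks; it simply accepts or rejects what was discovered and maintains routes out of band. A hypothetical minimal implementation, assuming the interface contains only the methods shown in this hunk:

package example

import (
	"log"
	"time"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// acceptAllNDPDispatcher is an illustrative integrator-side dispatcher: it
// remembers every discovered router, prefix and SLAAC address, and only logs
// the rest.
type acceptAllNDPDispatcher struct{}

var _ stack.NDPDispatcher = (*acceptAllNDPDispatcher)(nil)

func (*acceptAllNDPDispatcher) OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) {
	log.Printf("DAD on NIC %d for %s: resolved=%t err=%v", nicID, addr, resolved, err)
}

func (*acceptAllNDPDispatcher) OnDefaultRouterDiscovered(tcpip.NICID, tcpip.Address) bool { return true }

func (*acceptAllNDPDispatcher) OnDefaultRouterInvalidated(tcpip.NICID, tcpip.Address) {}

func (*acceptAllNDPDispatcher) OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) bool { return true }

func (*acceptAllNDPDispatcher) OnOnLinkPrefixInvalidated(tcpip.NICID, tcpip.Subnet) {}

func (*acceptAllNDPDispatcher) OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool {
	return true // let the stack assign every SLAAC-generated address
}

func (*acceptAllNDPDispatcher) OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix) {}

func (*acceptAllNDPDispatcher) OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix) {}

func (*acceptAllNDPDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration) {
	log.Printf("RDNSS on NIC %d: %v (lifetime %s)", nicID, addrs, lifetime)
}

func (*acceptAllNDPDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, c stack.DHCPv6ConfigurationFromNDPRA) {
	log.Printf("DHCPv6 configuration on NIC %d changed to %d", nicID, c)
}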
@@ -152,9 +263,24 @@ type NDPConfigurations struct {
// The amount of time to wait between sending Neighbor solicitation
// messages.
//
- // Must be greater than 0.5s.
+ // Must be greater than or equal to 1ms.
RetransmitTimer time.Duration
+ // The number of Router Solicitation messages to send when the NIC
+ // becomes enabled.
+ MaxRtrSolicitations uint8
+
+ // The amount of time between transmitting Router Solicitation messages.
+ //
+ // Must be greater than or equal to 0.5s.
+ RtrSolicitationInterval time.Duration
+
+ // The maximum amount of time before transmitting the first Router
+ // Solicitation message.
+ //
+ // Must be greater than or equal to 0s.
+ MaxRtrSolicitationDelay time.Duration
+
// HandleRAs determines whether or not Router Advertisements will be
// processed.
HandleRAs bool
@@ -168,17 +294,32 @@ type NDPConfigurations struct {
// will be discovered from Router Advertisements' Prefix Information
// option. This configuration is ignored if HandleRAs is false.
DiscoverOnLinkPrefixes bool
+
+ // AutoGenGlobalAddresses determines whether or not global IPv6
+ // addresses will be generated for a NIC in response to receiving a new
+ // Prefix Information option with its Autonomous Address
+ // AutoConfiguration flag set, as a host, as per RFC 4862 (SLAAC).
+ //
+ // Note, if an address was already generated for some unique prefix, as
+ // part of SLAAC, this option does not affect whether or not the
+ // lifetime(s) of the generated address changes; this option only
+ // affects the generation of new addresses as part of SLAAC.
+ AutoGenGlobalAddresses bool
}
// DefaultNDPConfigurations returns an NDPConfigurations populated with
// default values.
func DefaultNDPConfigurations() NDPConfigurations {
return NDPConfigurations{
- DupAddrDetectTransmits: defaultDupAddrDetectTransmits,
- RetransmitTimer: defaultRetransmitTimer,
- HandleRAs: defaultHandleRAs,
- DiscoverDefaultRouters: defaultDiscoverDefaultRouters,
- DiscoverOnLinkPrefixes: defaultDiscoverOnLinkPrefixes,
+ DupAddrDetectTransmits: defaultDupAddrDetectTransmits,
+ RetransmitTimer: defaultRetransmitTimer,
+ MaxRtrSolicitations: defaultMaxRtrSolicitations,
+ RtrSolicitationInterval: defaultRtrSolicitationInterval,
+ MaxRtrSolicitationDelay: defaultMaxRtrSolicitationDelay,
+ HandleRAs: defaultHandleRAs,
+ DiscoverDefaultRouters: defaultDiscoverDefaultRouters,
+ DiscoverOnLinkPrefixes: defaultDiscoverOnLinkPrefixes,
+ AutoGenGlobalAddresses: defaultAutoGenGlobalAddresses,
}
}
@@ -187,10 +328,24 @@ func DefaultNDPConfigurations() NDPConfigurations {
//
// If RetransmitTimer is less than minimumRetransmitTimer, then a value of
// defaultRetransmitTimer will be used.
+//
+// If RtrSolicitationInterval is less than minimumRtrSolicitationInterval, then
+// a value of defaultRtrSolicitationInterval will be used.
+//
+// If MaxRtrSolicitationDelay is less than minimumMaxRtrSolicitationDelay, then
+// a value of defaultMaxRtrSolicitationDelay will be used.
func (c *NDPConfigurations) validate() {
if c.RetransmitTimer < minimumRetransmitTimer {
c.RetransmitTimer = defaultRetransmitTimer
}
+
+ if c.RtrSolicitationInterval < minimumRtrSolicitationInterval {
+ c.RtrSolicitationInterval = defaultRtrSolicitationInterval
+ }
+
+ if c.MaxRtrSolicitationDelay < minimumMaxRtrSolicitationDelay {
+ c.MaxRtrSolicitationDelay = defaultMaxRtrSolicitationDelay
+ }
}
// ndpState is the per-interface NDP state.
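The new configuration knobs are clamped by validate() before use: a RetransmitTimer under 1ms, an RtrSolicitationInterval under 0.5s, or a negative MaxRtrSolicitationDelay silently falls back to its default. A hypothetical sketch of building a configuration with the new fields:

package example

import (
	"time"

	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// routerSolicitConfig is an illustrative helper, not part of this change. It
// starts from the defaults and tweaks the new router solicitation knobs; the
// stack's internal validate() resets any out-of-range value to its default.
func routerSolicitConfig() stack.NDPConfigurations {
	c := stack.DefaultNDPConfigurations()
	c.MaxRtrSolicitations = 5
	c.RtrSolicitationInterval = 100 * time.Millisecond // below the 500ms minimum, so it becomes 4s
	c.MaxRtrSolicitationDelay = 0                      // allowed: the minimum delay is 0
	return c
}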
@@ -210,6 +365,16 @@ type ndpState struct {
// The on-link prefixes discovered through Router Advertisements' Prefix
// Information option.
onLinkPrefixes map[tcpip.Subnet]onLinkPrefixState
+
+ // The timer used to send the next router solicitation message.
+ // If routers are being solicited, rtrSolicitTimer MUST NOT be nil.
+ rtrSolicitTimer *time.Timer
+
+ // The addresses generated by SLAAC.
+ autoGenAddresses map[tcpip.Address]autoGenAddressState
+
+ // The last learned DHCPv6 configuration from an NDP RA.
+ dhcpv6Configuration DHCPv6ConfigurationFromNDPRA
}
// dadState holds the Duplicate Address Detection timer and channel to signal
@@ -228,46 +393,28 @@ type dadState struct {
// defaultRouterState holds data associated with a default router discovered by
// a Router Advertisement (RA).
type defaultRouterState struct {
- invalidationTimer *time.Timer
-
- // Used to inform the timer not to invalidate the default router (R) in
- // a race condition (T1 is a goroutine that handles an RA from R and T2
- // is the goroutine that handles R's invalidation timer firing):
- // T1: Receive a new RA from R
- // T1: Obtain the NIC's lock before processing the RA
- // T2: R's invalidation timer fires, and gets blocked on obtaining the
- // NIC's lock
- // T1: Refreshes/extends R's lifetime & releases NIC's lock
- // T2: Obtains NIC's lock & invalidates R immediately
- //
- // To resolve this, T1 will check to see if the timer already fired, and
- // inform the timer using doNotInvalidate to not invalidate R, so that
- // once T2 obtains the lock, it will see that it is set to true and do
- // nothing further.
- doNotInvalidate *bool
+ invalidationTimer tcpip.CancellableTimer
}
// onLinkPrefixState holds data associated with an on-link prefix discovered by
// a Router Advertisement's Prefix Information option (PI) when the NDP
// configurations was configured to do so.
type onLinkPrefixState struct {
- invalidationTimer *time.Timer
-
- // Used to signal the timer not to invalidate the on-link prefix (P) in
- // a race condition (T1 is a goroutine that handles a PI for P and T2
- // is the goroutine that handles P's invalidation timer firing):
- // T1: Receive a new PI for P
- // T1: Obtain the NIC's lock before processing the PI
- // T2: P's invalidation timer fires, and gets blocked on obtaining the
- // NIC's lock
- // T1: Refreshes/extends P's lifetime & releases NIC's lock
- // T2: Obtains NIC's lock & invalidates P immediately
- //
- // To resolve this, T1 will check to see if the timer already fired, and
- // inform the timer using doNotInvalidate to not invalidate P, so that
- // once T2 obtains the lock, it will see that it is set to true and do
- // nothing further.
- doNotInvalidate *bool
+ invalidationTimer tcpip.CancellableTimer
+}
+
+// autoGenAddressState holds data associated with an address generated via
+// SLAAC.
+type autoGenAddressState struct {
+ // A reference to the referencedNetworkEndpoint that this autoGenAddressState
+ // is holding state for.
+ ref *referencedNetworkEndpoint
+
+ deprecationTimer tcpip.CancellableTimer
+ invalidationTimer tcpip.CancellableTimer
+
+ // Nonzero only when the address is not valid forever.
+ validUntil time.Time
}
// startDuplicateAddressDetection performs Duplicate Address Detection.
@@ -408,10 +555,12 @@ func (ndp *ndpState) doDuplicateAddressDetection(addr tcpip.Address, remaining u
// address.
panic(fmt.Sprintf("ndpdad: NIC(%d) is not in the solicited-node multicast group (%s) but it has addr %s", ndp.nic.ID(), snmc, addr))
}
+ snmcRef.incRef()
// Use the unspecified address as the source address when performing
// DAD.
r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), snmcRef, false, false)
+ defer r.Release()
hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
@@ -468,7 +617,7 @@ func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
// handleRA handles a Router Advertisement message that arrived on the NIC
// this ndp is for. Does nothing if the NIC is configured to not handle RAs.
//
-// The NIC that ndp belongs to and its associated stack MUST be locked.
+// The NIC that ndp belongs to MUST be locked.
func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
// Is the NIC configured to handle RAs at all?
//
@@ -480,6 +629,28 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
return
}
+ // Only worry about the DHCPv6 configuration if we have an NDPDispatcher as we
+ // only inform the dispatcher on configuration changes. We do nothing else
+ // with the information.
+ if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+ var configuration DHCPv6ConfigurationFromNDPRA
+ switch {
+ case ra.ManagedAddrConfFlag():
+ configuration = DHCPv6ManagedAddress
+
+ case ra.OtherConfFlag():
+ configuration = DHCPv6OtherConfigurations
+
+ default:
+ configuration = DHCPv6NoConfiguration
+ }
+
+ if ndp.dhcpv6Configuration != configuration {
+ ndp.dhcpv6Configuration = configuration
+ ndpDisp.OnDHCPv6Configuration(ndp.nic.ID(), configuration)
+ }
+ }
+
// Is the NIC configured to discover default routers?
if ndp.configs.DiscoverDefaultRouters {
rtr, ok := ndp.defaultRouters[ip]
@@ -497,27 +668,9 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
case ok && rl != 0:
// This is an already discovered default router. Update
// the invalidation timer.
- timer := rtr.invalidationTimer
-
- // We should ALWAYS have an invalidation timer for a
- // discovered router.
- if timer == nil {
- panic("ndphandlera: RA invalidation timer should not be nil")
- }
-
- if !timer.Stop() {
- // If we reach this point, then we know the
- // timer fired after we already took the NIC
- // lock. Inform the timer not to invalidate the
- // router when it obtains the lock as we just
- // got a new RA that refreshes its lifetime to a
- // non-zero value. See
- // defaultRouterState.doNotInvalidate for more
- // details.
- *rtr.doNotInvalidate = true
- }
-
- timer.Reset(rl)
+ rtr.invalidationTimer.StopLocked()
+ rtr.invalidationTimer.Reset(rl)
+ ndp.defaultRouters[ip] = rtr
case ok && rl == 0:
// We know about the router but it is no longer to be
@@ -534,19 +687,21 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
// we do not check the iterator for errors on calls to Next.
it, _ := ra.Options().Iter(false)
for opt, done, _ := it.Next(); !done; opt, done, _ = it.Next() {
- switch opt.Type() {
- case header.NDPPrefixInformationType:
- if !ndp.configs.DiscoverOnLinkPrefixes {
+ switch opt := opt.(type) {
+ case header.NDPRecursiveDNSServer:
+ if ndp.nic.stack.ndpDisp == nil {
continue
}
- pi := opt.(header.NDPPrefixInformation)
+ ndp.nic.stack.ndpDisp.OnRecursiveDNSServerOption(ndp.nic.ID(), opt.Addresses(), opt.Lifetime())
- prefix := pi.Subnet()
+ case header.NDPPrefixInformation:
+ prefix := opt.Subnet()
// Is the prefix a link-local?
if header.IsV6LinkLocalAddress(prefix.ID()) {
- // ...Yes, skip as per RFC 4861 section 6.3.4.
+ // ...Yes, skip as per RFC 4861 section 6.3.4,
+ // and RFC 4862 section 5.5.3.b (for SLAAC).
continue
}
@@ -557,82 +712,13 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
continue
}
- if !pi.OnLinkFlag() {
- // Not on-link so don't "discover" it as an
- // on-link prefix.
- continue
- }
-
- prefixState, ok := ndp.onLinkPrefixes[prefix]
- vl := pi.ValidLifetime()
- switch {
- case !ok && vl == 0:
- // Don't know about this prefix but has a zero
- // valid lifetime, so just ignore.
- continue
-
- case !ok && vl != 0:
- // This is a new on-link prefix we are
- // discovering.
- //
- // Only remember it if we currently know about
- // less than MaxDiscoveredOnLinkPrefixes on-link
- // prefixes.
- if len(ndp.onLinkPrefixes) < MaxDiscoveredOnLinkPrefixes {
- ndp.rememberOnLinkPrefix(prefix, vl)
- }
- continue
-
- case ok && vl == 0:
- // We know about the on-link prefix, but it is
- // no longer to be considered on-link, so
- // invalidate it.
- ndp.invalidateOnLinkPrefix(prefix)
- continue
- }
-
- // This is an already discovered on-link prefix with a
- // new non-zero valid lifetime.
- // Update the invalidation timer.
- timer := prefixState.invalidationTimer
-
- if timer == nil && vl >= header.NDPPrefixInformationInfiniteLifetime {
- // Had infinite valid lifetime before and
- // continues to have an invalid lifetime. Do
- // nothing further.
- continue
+ if opt.OnLinkFlag() {
+ ndp.handleOnLinkPrefixInformation(opt)
}
- if timer != nil && !timer.Stop() {
- // If we reach this point, then we know the
- // timer already fired after we took the NIC
- // lock. Inform the timer to not invalidate
- // the prefix once it obtains the lock as we
- // just got a new PI that refeshes its lifetime
- // to a non-zero value. See
- // onLinkPrefixState.doNotInvalidate for more
- // details.
- *prefixState.doNotInvalidate = true
+ if opt.AutonomousAddressConfigurationFlag() {
+ ndp.handleAutonomousPrefixInformation(opt)
}
-
- if vl >= header.NDPPrefixInformationInfiniteLifetime {
- // Prefix is now valid forever so we don't need
- // an invalidation timer.
- prefixState.invalidationTimer = nil
- ndp.onLinkPrefixes[prefix] = prefixState
- continue
- }
-
- if timer != nil {
- // We already have a timer so just reset it to
- // expire after the new valid lifetime.
- timer.Reset(vl)
- continue
- }
-
- // We do not have a timer so just create a new one.
- prefixState.invalidationTimer = ndp.prefixInvalidationCallback(prefix, vl, prefixState.doNotInvalidate)
- ndp.onLinkPrefixes[prefix] = prefixState
}
// TODO(b/141556115): Do (MTU) Parameter Discovery.
@@ -641,7 +727,7 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
// invalidateDefaultRouter invalidates a discovered default router.
//
-// The NIC that ndp belongs to and its associated stack MUST be locked.
+// The NIC that ndp belongs to MUST be locked.
func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
rtr, ok := ndp.defaultRouters[ip]
@@ -651,16 +737,13 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
return
}
- rtr.invalidationTimer.Stop()
- rtr.invalidationTimer = nil
- *rtr.doNotInvalidate = true
- rtr.doNotInvalidate = nil
+ rtr.invalidationTimer.StopLocked()
delete(ndp.defaultRouters, ip)
// Let the integrator know a discovered default router is invalidated.
- if ndp.nic.stack.ndpDisp != nil {
- ndp.nic.stack.routeTable = ndp.nic.stack.ndpDisp.OnDefaultRouterInvalidated(ndp.nic.ID(), ip)
+ if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+ ndpDisp.OnDefaultRouterInvalidated(ndp.nic.ID(), ip)
}
}
@@ -669,43 +752,29 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
//
// The router identified by ip MUST NOT already be known by the NIC.
//
-// The NIC that ndp belongs to and its associated stack MUST be locked.
+// The NIC that ndp belongs to MUST be locked.
func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
- if ndp.nic.stack.ndpDisp == nil {
+ ndpDisp := ndp.nic.stack.ndpDisp
+ if ndpDisp == nil {
return
}
// Inform the integrator when we discovered a default router.
- remember, routeTable := ndp.nic.stack.ndpDisp.OnDefaultRouterDiscovered(ndp.nic.ID(), ip)
- if !remember {
+ if !ndpDisp.OnDefaultRouterDiscovered(ndp.nic.ID(), ip) {
// Informed by the integrator to not remember the router, do
// nothing further.
return
}
- // Used to signal the timer not to invalidate the default router (R) in
- // a race condition. See defaultRouterState.doNotInvalidate for more
- // details.
- var doNotInvalidate bool
-
- ndp.defaultRouters[ip] = defaultRouterState{
- invalidationTimer: time.AfterFunc(rl, func() {
- ndp.nic.stack.mu.Lock()
- defer ndp.nic.stack.mu.Unlock()
- ndp.nic.mu.Lock()
- defer ndp.nic.mu.Unlock()
-
- if doNotInvalidate {
- doNotInvalidate = false
- return
- }
-
+ state := defaultRouterState{
+ invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
ndp.invalidateDefaultRouter(ip)
}),
- doNotInvalidate: &doNotInvalidate,
}
- ndp.nic.stack.routeTable = routeTable
+ state.invalidationTimer.Reset(rl)
+
+ ndp.defaultRouters[ip] = state
}
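The doNotInvalidate bookkeeping is replaced by tcpip.CancellableTimer, which is constructed with the lock protecting the state it invalidates; its callback runs under that lock, and StopLocked/Reset are called while the lock is already held, so a refresh that races with expiry can no longer invalidate freshly extended state. A minimal sketch of the same pattern outside the NDP code, assuming that documented contract:

package example

import (
	"time"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
)

// entry is an illustrative cache entry whose expiry uses the CancellableTimer
// pattern adopted above.
type entry struct {
	mu      sync.RWMutex
	expired bool
	timer   tcpip.CancellableTimer
}

func newEntry(lifetime time.Duration) *entry {
	e := &entry{}
	e.timer = tcpip.MakeCancellableTimer(&e.mu, func() {
		// Runs with e.mu held; a refresh that already called StopLocked under
		// the lock prevents this callback from firing for the old deadline.
		e.expired = true
	})
	e.mu.Lock()
	e.timer.Reset(lifetime)
	e.mu.Unlock()
	return e
}

// refresh extends the entry's lifetime, mirroring how handleRA refreshes a
// default router's invalidation timer.
func (e *entry) refresh(lifetime time.Duration) {
	e.mu.Lock()
	defer e.mu.Unlock()
	e.timer.StopLocked()
	e.timer.Reset(lifetime)
}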
// rememberOnLinkPrefix remembers a newly discovered on-link prefix with IPv6
@@ -713,42 +782,36 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
//
// The prefix identified by prefix MUST NOT already be known.
//
-// The NIC that ndp belongs to and its associated stack MUST be locked.
+// The NIC that ndp belongs to MUST be locked.
func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) {
- if ndp.nic.stack.ndpDisp == nil {
+ ndpDisp := ndp.nic.stack.ndpDisp
+ if ndpDisp == nil {
return
}
// Inform the integrator when we discovered an on-link prefix.
- remember, routeTable := ndp.nic.stack.ndpDisp.OnOnLinkPrefixDiscovered(ndp.nic.ID(), prefix)
- if !remember {
+ if !ndpDisp.OnOnLinkPrefixDiscovered(ndp.nic.ID(), prefix) {
// Informed by the integrator to not remember the prefix, do
// nothing further.
return
}
- // Used to signal the timer not to invalidate the on-link prefix (P) in
- // a race condition. See onLinkPrefixState.doNotInvalidate for more
- // details.
- var doNotInvalidate bool
- var timer *time.Timer
-
- // Only create a timer if the lifetime is not infinite.
- if l < header.NDPPrefixInformationInfiniteLifetime {
- timer = ndp.prefixInvalidationCallback(prefix, l, &doNotInvalidate)
+ state := onLinkPrefixState{
+ invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+ ndp.invalidateOnLinkPrefix(prefix)
+ }),
}
- ndp.onLinkPrefixes[prefix] = onLinkPrefixState{
- invalidationTimer: timer,
- doNotInvalidate: &doNotInvalidate,
+ if l < header.NDPInfiniteLifetime {
+ state.invalidationTimer.Reset(l)
}
- ndp.nic.stack.routeTable = routeTable
+ ndp.onLinkPrefixes[prefix] = state
}
// invalidateOnLinkPrefix invalidates a discovered on-link prefix.
//
-// The NIC that ndp belongs to and its associated stack MUST be locked.
+// The NIC that ndp belongs to MUST be locked.
func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
s, ok := ndp.onLinkPrefixes[prefix]
@@ -758,40 +821,451 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
return
}
- if s.invalidationTimer != nil {
- s.invalidationTimer.Stop()
- s.invalidationTimer = nil
- *s.doNotInvalidate = true
- }
-
- s.doNotInvalidate = nil
+ s.invalidationTimer.StopLocked()
delete(ndp.onLinkPrefixes, prefix)
// Let the integrator know a discovered on-link prefix is invalidated.
- if ndp.nic.stack.ndpDisp != nil {
- ndp.nic.stack.routeTable = ndp.nic.stack.ndpDisp.OnOnLinkPrefixInvalidated(ndp.nic.ID(), prefix)
+ if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+ ndpDisp.OnOnLinkPrefixInvalidated(ndp.nic.ID(), prefix)
}
}
-// prefixInvalidationCallback returns a new on-link prefix invalidation timer
-// for prefix that fires after vl.
+// handleOnLinkPrefixInformation handles a Prefix Information option with
+// its on-link flag set, as per RFC 4861 section 6.3.4.
//
-// doNotInvalidate is used to signal the timer when it fires at the same time
-// that a prefix's valid lifetime gets refreshed. See
-// onLinkPrefixState.doNotInvalidate for more details.
-func (ndp *ndpState) prefixInvalidationCallback(prefix tcpip.Subnet, vl time.Duration, doNotInvalidate *bool) *time.Timer {
- return time.AfterFunc(vl, func() {
- ndp.nic.stack.mu.Lock()
- defer ndp.nic.stack.mu.Unlock()
- ndp.nic.mu.Lock()
- defer ndp.nic.mu.Unlock()
+// handleOnLinkPrefixInformation assumes that the prefix this pi is for is
+// not the link-local prefix and the on-link flag is set.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformation) {
+ prefix := pi.Subnet()
+ prefixState, ok := ndp.onLinkPrefixes[prefix]
+ vl := pi.ValidLifetime()
+
+ if !ok && vl == 0 {
+ // Don't know about this prefix but it has a zero valid
+ // lifetime, so just ignore.
+ return
+ }
+
+ if !ok && vl != 0 {
+ // This is a new on-link prefix we are discovering
+ //
+ // Only remember it if we currently know about less than
+ // MaxDiscoveredOnLinkPrefixes on-link prefixes.
+ if ndp.configs.DiscoverOnLinkPrefixes && len(ndp.onLinkPrefixes) < MaxDiscoveredOnLinkPrefixes {
+ ndp.rememberOnLinkPrefix(prefix, vl)
+ }
+ return
+ }
+
+ if ok && vl == 0 {
+ // We know about the on-link prefix, but it is
+ // no longer to be considered on-link, so
+ // invalidate it.
+ ndp.invalidateOnLinkPrefix(prefix)
+ return
+ }
+
+ // This is an already discovered on-link prefix with a
+ // new non-zero valid lifetime.
+ //
+ // Update the invalidation timer.
+
+ prefixState.invalidationTimer.StopLocked()
+
+ if vl < header.NDPInfiniteLifetime {
+ // Prefix is valid for a finite lifetime, reset the timer to expire after
+ // the new valid lifetime.
+ prefixState.invalidationTimer.Reset(vl)
+ }
+
+ ndp.onLinkPrefixes[prefix] = prefixState
+}
- if *doNotInvalidate {
- *doNotInvalidate = false
+// handleAutonomousPrefixInformation handles a Prefix Information option with
+// its autonomous flag set, as per RFC 4862 section 5.5.3.
+//
+// handleAutonomousPrefixInformation assumes that the prefix this pi is for is
+// not the link-local prefix and the autonomous flag is set.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInformation) {
+ vl := pi.ValidLifetime()
+ pl := pi.PreferredLifetime()
+
+ // If the preferred lifetime is greater than the valid lifetime,
+ // silently ignore the Prefix Information option, as per RFC 4862
+ // section 5.5.3.c.
+ if pl > vl {
+ return
+ }
+
+ prefix := pi.Subnet()
+
+ // Check if we already have an auto-generated address for prefix.
+ for addr, addrState := range ndp.autoGenAddresses {
+ refAddrWithPrefix := tcpip.AddressWithPrefix{Address: addr, PrefixLen: addrState.ref.ep.PrefixLen()}
+ if refAddrWithPrefix.Subnet() != prefix {
+ continue
+ }
+
+ // At this point, we know we are refreshing a SLAAC generated IPv6 address
+ // with the prefix prefix. Do the work as outlined by RFC 4862 section
+ // 5.5.3.e.
+ ndp.refreshAutoGenAddressLifetimes(addr, pl, vl)
+ return
+ }
+
+ // We do not already have an address within the prefix, prefix. Do the
+ // work as outlined by RFC 4862 section 5.5.3.d if n is configured
+ // to auto-generated global addresses by SLAAC.
+ ndp.newAutoGenAddress(prefix, pl, vl)
+}
+
+// newAutoGenAddress generates a new SLAAC address with the provided lifetimes
+// for prefix.
+//
+// pl is the new preferred lifetime. vl is the new valid lifetime.
+func (ndp *ndpState) newAutoGenAddress(prefix tcpip.Subnet, pl, vl time.Duration) {
+ // Are we configured to auto-generate new global addresses?
+ if !ndp.configs.AutoGenGlobalAddresses {
+ return
+ }
+
+ // If we do not already have an address for this prefix and the valid
+ // lifetime is 0, no need to do anything further, as per RFC 4862
+ // section 5.5.3.d.
+ if vl == 0 {
+ return
+ }
+
+ // Make sure the prefix is valid (as far as its length is concerned) to
+ // generate a valid IPv6 address from an interface identifier (IID), as
+	// per RFC 4862 section 5.5.3.d.
+ if prefix.Prefix() != validPrefixLenForAutoGen {
+ return
+ }
+
+ addrBytes := []byte(prefix.ID())
+ if oIID := ndp.nic.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
+ addrBytes = header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], prefix, oIID.NICNameFromID(ndp.nic.ID(), ndp.nic.name), 0 /* dadCounter */, oIID.SecretKey)
+ } else {
+ // Only attempt to generate an interface-specific IID if we have a valid
+ // link address.
+ //
+ // TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
+ // LinkEndpoint.LinkAddress) before reaching this point.
+ linkAddr := ndp.nic.linkEP.LinkAddress()
+ if !header.IsValidUnicastEthernetAddress(linkAddr) {
return
}
+ // Generate an address within prefix from the modified EUI-64 of ndp's NIC's
+ // Ethernet MAC address.
+ header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+ }
+ addr := tcpip.Address(addrBytes)
+ addrWithPrefix := tcpip.AddressWithPrefix{
+ Address: addr,
+ PrefixLen: validPrefixLenForAutoGen,
+ }
+
+ // If the nic already has this address, do nothing further.
+ if ndp.nic.hasPermanentAddrLocked(addr) {
+ return
+ }
+
+ // Inform the integrator that we have a new SLAAC address.
+ ndpDisp := ndp.nic.stack.ndpDisp
+ if ndpDisp == nil {
+ return
+ }
+ if !ndpDisp.OnAutoGenAddress(ndp.nic.ID(), addrWithPrefix) {
+ // Informed by the integrator not to add the address.
+ return
+ }
+
+ protocolAddr := tcpip.ProtocolAddress{
+ Protocol: header.IPv6ProtocolNumber,
+ AddressWithPrefix: addrWithPrefix,
+ }
+ // If the preferred lifetime is zero, then the address should be considered
+ // deprecated.
+ deprecated := pl == 0
+ ref, err := ndp.nic.addPermanentAddressLocked(protocolAddr, FirstPrimaryEndpoint, slaac, deprecated)
+ if err != nil {
+ log.Fatalf("ndp: error when adding address %s: %s", protocolAddr, err)
+ }
+
+ state := autoGenAddressState{
+ ref: ref,
+ deprecationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+ addrState, ok := ndp.autoGenAddresses[addr]
+ if !ok {
+				log.Fatalf("ndp: must have an autoGenAddresses entry for the SLAAC generated IPv6 address %s", addr)
+ }
+ addrState.ref.deprecated = true
+ ndp.notifyAutoGenAddressDeprecated(addr)
+ }),
+ invalidationTimer: tcpip.MakeCancellableTimer(&ndp.nic.mu, func() {
+ ndp.invalidateAutoGenAddress(addr)
+ }),
+ }
+
+ // Setup the initial timers to deprecate and invalidate this newly generated
+ // address.
+
+ if !deprecated && pl < header.NDPInfiniteLifetime {
+ state.deprecationTimer.Reset(pl)
+ }
+
+ if vl < header.NDPInfiniteLifetime {
+ state.invalidationTimer.Reset(vl)
+ state.validUntil = time.Now().Add(vl)
+ }
+
+ ndp.autoGenAddresses[addr] = state
+}
+
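
Editorial aside (not part of the change above): when opaque IIDs are not configured, newAutoGenAddress fills the lower 64 bits of the address with the modified EUI-64 of the NIC's Ethernet MAC. A minimal sketch of that transform (RFC 4291 Appendix A) follows; the helper name modifiedEUI64 is hypothetical.

package ndpsketch // illustrative only

// modifiedEUI64 expands a 48-bit MAC into an 8-byte interface identifier by
// inserting 0xFF, 0xFE between the two halves of the MAC and flipping the
// universal/local bit of the first byte.
func modifiedEUI64(mac [6]byte) [8]byte {
	return [8]byte{
		mac[0] ^ 0x02, // flip the universal/local bit
		mac[1],
		mac[2],
		0xFF,
		0xFE,
		mac[3],
		mac[4],
		mac[5],
	}
}

For example, the test link address 02:02:03:04:05:06 yields the interface identifier 00:02:03:ff:fe:04:05:06.
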
+// refreshAutoGenAddressLifetimes refreshes the lifetime of a SLAAC generated
+// address addr.
+//
+// pl is the new preferred lifetime. vl is the new valid lifetime.
+func (ndp *ndpState) refreshAutoGenAddressLifetimes(addr tcpip.Address, pl, vl time.Duration) {
+ addrState, ok := ndp.autoGenAddresses[addr]
+ if !ok {
+ log.Fatalf("ndp: SLAAC state not found to refresh lifetimes for %s", addr)
+ }
+ defer func() { ndp.autoGenAddresses[addr] = addrState }()
+
+ // If the preferred lifetime is zero, then the address should be considered
+ // deprecated.
+ deprecated := pl == 0
+ wasDeprecated := addrState.ref.deprecated
+ addrState.ref.deprecated = deprecated
+
+ // Only send the deprecation event if the deprecated status for addr just
+ // changed from non-deprecated to deprecated.
+ if !wasDeprecated && deprecated {
+ ndp.notifyAutoGenAddressDeprecated(addr)
+ }
+
+ // If addr was preferred for some finite lifetime before, stop the deprecation
+ // timer so it can be reset.
+ addrState.deprecationTimer.StopLocked()
+
+ // Reset the deprecation timer if addr has a finite preferred lifetime.
+ if !deprecated && pl < header.NDPInfiniteLifetime {
+ addrState.deprecationTimer.Reset(pl)
+ }
+
+	// As per RFC 4862 section 5.5.3.e, update the valid lifetime of the
+	// address as follows:
+	//
+ // 1) If the received Valid Lifetime is greater than 2 hours or greater than
+ // RemainingLifetime, set the valid lifetime of the address to the
+ // advertised Valid Lifetime.
+ //
+ // 2) If RemainingLifetime is less than or equal to 2 hours, ignore the
+ // advertised Valid Lifetime.
+ //
+ // 3) Otherwise, reset the valid lifetime of the address to 2 hours.
+
+ // Handle the infinite valid lifetime separately as we do not keep a timer in
+ // this case.
+ if vl >= header.NDPInfiniteLifetime {
+ addrState.invalidationTimer.StopLocked()
+ addrState.validUntil = time.Time{}
+ return
+ }
+
+ var effectiveVl time.Duration
+ var rl time.Duration
+
+ // If the address was originally set to be valid forever, assume the remaining
+ // time to be the maximum possible value.
+ if addrState.validUntil == (time.Time{}) {
+ rl = header.NDPInfiniteLifetime
+ } else {
+ rl = time.Until(addrState.validUntil)
+ }
+
+ if vl > MinPrefixInformationValidLifetimeForUpdate || vl > rl {
+ effectiveVl = vl
+ } else if rl <= MinPrefixInformationValidLifetimeForUpdate {
+ return
+ } else {
+ effectiveVl = MinPrefixInformationValidLifetimeForUpdate
+ }
+
+ addrState.invalidationTimer.StopLocked()
+ addrState.invalidationTimer.Reset(effectiveVl)
+ addrState.validUntil = time.Now().Add(effectiveVl)
+}
+
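Editorial aside (not part of the change above): the lifetime-update rule that refreshAutoGenAddressLifetimes applies can be read as a single three-way decision. A minimal sketch under the assumption of a hypothetical standalone helper, where twoHours stands in for MinPrefixInformationValidLifetimeForUpdate:

package ndpsketch // illustrative only

import "time"

// effectiveValidLifetime applies the RFC 4862 section 5.5.3.e rule. advertised
// is the Valid Lifetime from the Prefix Information option and remaining is
// the time left before the address would otherwise be invalidated. The boolean
// reports whether the invalidation timer should be reset at all.
func effectiveValidLifetime(advertised, remaining, twoHours time.Duration) (time.Duration, bool) {
	switch {
	case advertised > twoHours || advertised > remaining:
		// 1) Use the advertised Valid Lifetime.
		return advertised, true
	case remaining <= twoHours:
		// 2) Ignore the advertised Valid Lifetime.
		return 0, false
	default:
		// 3) Reset the valid lifetime to two hours.
		return twoHours, true
	}
}
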
+// notifyAutoGenAddressDeprecated notifies the stack's NDP dispatcher that addr
+// has been deprecated.
+func (ndp *ndpState) notifyAutoGenAddressDeprecated(addr tcpip.Address) {
+ if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+ ndpDisp.OnAutoGenAddressDeprecated(ndp.nic.ID(), tcpip.AddressWithPrefix{
+ Address: addr,
+ PrefixLen: validPrefixLenForAutoGen,
+ })
+ }
+}
+
+// invalidateAutoGenAddress invalidates an auto-generated address.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) invalidateAutoGenAddress(addr tcpip.Address) {
+ if !ndp.cleanupAutoGenAddrResourcesAndNotify(addr) {
+ return
+ }
+
+ ndp.nic.removePermanentAddressLocked(addr)
+}
+
+// cleanupAutoGenAddrResourcesAndNotify cleans up an invalidated auto-generated
+// address's resources from ndp. If the stack has an NDP dispatcher, it will
+// be notified that addr has been invalidated.
+//
+// Returns true if ndp had resources for addr to clean up.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupAutoGenAddrResourcesAndNotify(addr tcpip.Address) bool {
+ state, ok := ndp.autoGenAddresses[addr]
+ if !ok {
+ return false
+ }
+
+ state.deprecationTimer.StopLocked()
+ state.invalidationTimer.StopLocked()
+ delete(ndp.autoGenAddresses, addr)
+
+ if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+ ndpDisp.OnAutoGenAddressInvalidated(ndp.nic.ID(), tcpip.AddressWithPrefix{
+ Address: addr,
+ PrefixLen: validPrefixLenForAutoGen,
+ })
+ }
+
+ return true
+}
+
+// cleanupHostOnlyState cleans up any state that is only useful for hosts.
+//
+// cleanupHostOnlyState MUST be called when ndp's NIC is transitioning from a
+// host to a router. This function will invalidate all discovered on-link
+// prefixes, discovered routers, and auto-generated addresses as routers do not
+// normally process Router Advertisements to discover default routers and
+// on-link prefixes, and auto-generate addresses via SLAAC.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupHostOnlyState() {
+	for addr := range ndp.autoGenAddresses {
+ ndp.invalidateAutoGenAddress(addr)
+ }
+
+ if got := len(ndp.autoGenAddresses); got != 0 {
+ log.Fatalf("ndp: still have auto-generated addresses after cleaning up, found = %d", got)
+ }
+
+	for prefix := range ndp.onLinkPrefixes {
ndp.invalidateOnLinkPrefix(prefix)
+ }
+
+ if got := len(ndp.onLinkPrefixes); got != 0 {
+ log.Fatalf("ndp: still have discovered on-link prefixes after cleaning up, found = %d", got)
+ }
+
+	for router := range ndp.defaultRouters {
+ ndp.invalidateDefaultRouter(router)
+ }
+
+ if got := len(ndp.defaultRouters); got != 0 {
+ log.Fatalf("ndp: still have discovered default routers after cleaning up, found = %d", got)
+ }
+}
+
+// startSolicitingRouters starts soliciting routers, as per RFC 4861 section
+// 6.3.7. If routers are already being solicited, this function does nothing.
+//
+// The NIC ndp belongs to MUST be locked.
+func (ndp *ndpState) startSolicitingRouters() {
+ if ndp.rtrSolicitTimer != nil {
+ // We are already soliciting routers.
+ return
+ }
+
+ remaining := ndp.configs.MaxRtrSolicitations
+ if remaining == 0 {
+ return
+ }
+
+ // Calculate the random delay before sending our first RS, as per RFC
+ // 4861 section 6.3.7.
+ var delay time.Duration
+ if ndp.configs.MaxRtrSolicitationDelay > 0 {
+ delay = time.Duration(rand.Int63n(int64(ndp.configs.MaxRtrSolicitationDelay)))
+ }
+
+ ndp.rtrSolicitTimer = time.AfterFunc(delay, func() {
+ // Send an RS message with the unspecified source address.
+ ref := ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, true)
+ r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
+ defer r.Release()
+
+ payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize
+ hdr := buffer.NewPrependable(header.IPv6MinimumSize + payloadSize)
+ pkt := header.ICMPv6(hdr.Prepend(payloadSize))
+ pkt.SetType(header.ICMPv6RouterSolicit)
+ pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+
+ sent := r.Stats().ICMP.V6PacketsSent
+ if err := r.WritePacket(nil,
+ NetworkHeaderParams{
+ Protocol: header.ICMPv6ProtocolNumber,
+ TTL: header.NDPHopLimit,
+ TOS: DefaultTOS,
+ }, tcpip.PacketBuffer{Header: hdr},
+ ); err != nil {
+ sent.Dropped.Increment()
+ log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.nic.ID(), err)
+ // Don't send any more messages if we had an error.
+ remaining = 0
+ } else {
+ sent.RouterSolicit.Increment()
+ remaining--
+ }
+
+ ndp.nic.mu.Lock()
+ defer ndp.nic.mu.Unlock()
+ if remaining == 0 {
+ ndp.rtrSolicitTimer = nil
+ } else if ndp.rtrSolicitTimer != nil {
+			// Note, we explicitly check that the timer field is still
+			// non-nil. If it is nil at this point, the NIC was asked to
+			// stop soliciting routers while this callback was running,
+			// so we must not schedule the next Router Solicitation
+			// message.
+ ndp.rtrSolicitTimer.Reset(ndp.configs.RtrSolicitationInterval)
+ }
})
+}
+
+// stopSolicitingRouters stops soliciting routers. If routers are not currently
+// being solicited, this function does nothing.
+//
+// The NIC ndp belongs to MUST be locked.
+func (ndp *ndpState) stopSolicitingRouters() {
+ if ndp.rtrSolicitTimer == nil {
+ // Nothing to do.
+ return
+ }
+
+ ndp.rtrSolicitTimer.Stop()
+ ndp.rtrSolicitTimer = nil
}
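
Editorial aside (not part of the change above): startSolicitingRouters jitters the first Router Solicitation by a uniformly random delay in [0, MaxRtrSolicitationDelay), as RFC 4861 section 6.3.7 requires. A minimal sketch of that calculation, using the standard library's math/rand rather than gvisor's pkg/rand, with a hypothetical helper name:

package ndpsketch // illustrative only

import (
	"math/rand"
	"time"
)

// firstRouterSolicitationDelay returns the delay applied before sending the
// first Router Solicitation: zero if no jitter is configured, otherwise a
// uniformly random duration in [0, maxDelay).
func firstRouterSolicitationDelay(maxDelay time.Duration) time.Duration {
	if maxDelay <= 0 {
		return 0
	}
	return time.Duration(rand.Int63n(int64(maxDelay)))
}
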
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
index 5b901f947..1a52e0e68 100644
--- a/pkg/tcpip/stack/ndp_test.go
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -21,6 +21,7 @@ import (
"time"
"github.com/google/go-cmp/cmp"
+ "gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/checker"
@@ -29,24 +30,58 @@ import (
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+ "gvisor.dev/gvisor/pkg/waiter"
)
const (
- addr1 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
- addr2 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
- addr3 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"
- linkAddr1 = "\x02\x02\x03\x04\x05\x06"
- linkAddr2 = "\x02\x02\x03\x04\x05\x07"
- linkAddr3 = "\x02\x02\x03\x04\x05\x08"
- defaultTimeout = 250 * time.Millisecond
+ addr1 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+ addr2 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+ addr3 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03")
+ linkAddr1 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+ linkAddr2 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x07")
+ linkAddr3 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x08")
+ defaultTimeout = 100 * time.Millisecond
)
var (
llAddr1 = header.LinkLocalAddr(linkAddr1)
llAddr2 = header.LinkLocalAddr(linkAddr2)
llAddr3 = header.LinkLocalAddr(linkAddr3)
+ dstAddr = tcpip.FullAddress{
+ Addr: "\x0a\x0b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01",
+ Port: 25,
+ }
)
+func addrForSubnet(subnet tcpip.Subnet, linkAddr tcpip.LinkAddress) tcpip.AddressWithPrefix {
+ if !header.IsValidUnicastEthernetAddress(linkAddr) {
+ return tcpip.AddressWithPrefix{}
+ }
+
+ addrBytes := []byte(subnet.ID())
+ header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+ return tcpip.AddressWithPrefix{
+ Address: tcpip.Address(addrBytes),
+ PrefixLen: 64,
+ }
+}
+
+// prefixSubnetAddr returns a prefix (Address + Length), the prefix's equivalent
+// tcpip.Subnet, and an address where the lower half of the address is composed
+// of the EUI-64 of linkAddr if it is a valid unicast ethernet address.
+func prefixSubnetAddr(offset uint8, linkAddr tcpip.LinkAddress) (tcpip.AddressWithPrefix, tcpip.Subnet, tcpip.AddressWithPrefix) {
+ prefixBytes := []byte{1, 2, 3, 4, 5, 6, 7, 8 + offset, 0, 0, 0, 0, 0, 0, 0, 0}
+ prefix := tcpip.AddressWithPrefix{
+ Address: tcpip.Address(prefixBytes),
+ PrefixLen: 64,
+ }
+
+ subnet := prefix.Subnet()
+
+ return prefix, subnet, addrForSubnet(subnet, linkAddr)
+}
+
// TestDADDisabled tests that an address successfully resolves immediately
// when DAD is not enabled (the default for an empty stack.Options).
func TestDADDisabled(t *testing.T) {
@@ -54,7 +89,7 @@ func TestDADDisabled(t *testing.T) {
NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
}
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(opts)
if err := s.CreateNIC(1, e); err != nil {
t.Fatalf("CreateNIC(_) = %s", err)
@@ -103,17 +138,48 @@ type ndpPrefixEvent struct {
discovered bool
}
+type ndpAutoGenAddrEventType int
+
+const (
+ newAddr ndpAutoGenAddrEventType = iota
+ deprecatedAddr
+ invalidatedAddr
+)
+
+type ndpAutoGenAddrEvent struct {
+ nicID tcpip.NICID
+ addr tcpip.AddressWithPrefix
+ eventType ndpAutoGenAddrEventType
+}
+
+type ndpRDNSS struct {
+ addrs []tcpip.Address
+ lifetime time.Duration
+}
+
+type ndpRDNSSEvent struct {
+ nicID tcpip.NICID
+ rdnss ndpRDNSS
+}
+
+type ndpDHCPv6Event struct {
+ nicID tcpip.NICID
+ configuration stack.DHCPv6ConfigurationFromNDPRA
+}
+
var _ stack.NDPDispatcher = (*ndpDispatcher)(nil)
// ndpDispatcher implements NDPDispatcher so tests can know when various NDP
// related events happen for test purposes.
type ndpDispatcher struct {
- dadC chan ndpDADEvent
- routerC chan ndpRouterEvent
- rememberRouter bool
- prefixC chan ndpPrefixEvent
- rememberPrefix bool
- routeTable []tcpip.Route
+ dadC chan ndpDADEvent
+ routerC chan ndpRouterEvent
+ rememberRouter bool
+ prefixC chan ndpPrefixEvent
+ rememberPrefix bool
+ autoGenAddrC chan ndpAutoGenAddrEvent
+ rdnssC chan ndpRDNSSEvent
+ dhcpv6ConfigurationC chan ndpDHCPv6Event
}
// Implements stack.NDPDispatcher.OnDuplicateAddressDetectionStatus.
@@ -129,101 +195,105 @@ func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, add
}
// Implements stack.NDPDispatcher.OnDefaultRouterDiscovered.
-func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) (bool, []tcpip.Route) {
- if n.routerC != nil {
- n.routerC <- ndpRouterEvent{
+func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool {
+ if c := n.routerC; c != nil {
+ c <- ndpRouterEvent{
nicID,
addr,
true,
}
}
- if !n.rememberRouter {
- return false, nil
- }
-
- rt := append([]tcpip.Route(nil), n.routeTable...)
- rt = append(rt, tcpip.Route{
- Destination: header.IPv6EmptySubnet,
- Gateway: addr,
- NIC: nicID,
- })
- n.routeTable = rt
- return true, rt
+ return n.rememberRouter
}
// Implements stack.NDPDispatcher.OnDefaultRouterInvalidated.
-func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) []tcpip.Route {
- if n.routerC != nil {
- n.routerC <- ndpRouterEvent{
+func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) {
+ if c := n.routerC; c != nil {
+ c <- ndpRouterEvent{
nicID,
addr,
false,
}
}
-
- var rt []tcpip.Route
- exclude := tcpip.Route{
- Destination: header.IPv6EmptySubnet,
- Gateway: addr,
- NIC: nicID,
- }
-
- for _, r := range n.routeTable {
- if r != exclude {
- rt = append(rt, r)
- }
- }
- n.routeTable = rt
- return rt
}
// Implements stack.NDPDispatcher.OnOnLinkPrefixDiscovered.
-func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) (bool, []tcpip.Route) {
- if n.prefixC != nil {
- n.prefixC <- ndpPrefixEvent{
+func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool {
+ if c := n.prefixC; c != nil {
+ c <- ndpPrefixEvent{
nicID,
prefix,
true,
}
}
- if !n.rememberPrefix {
- return false, nil
- }
-
- rt := append([]tcpip.Route(nil), n.routeTable...)
- rt = append(rt, tcpip.Route{
- Destination: prefix,
- NIC: nicID,
- })
- n.routeTable = rt
- return true, rt
+ return n.rememberPrefix
}
// Implements stack.NDPDispatcher.OnOnLinkPrefixInvalidated.
-func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) []tcpip.Route {
- if n.prefixC != nil {
- n.prefixC <- ndpPrefixEvent{
+func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) {
+ if c := n.prefixC; c != nil {
+ c <- ndpPrefixEvent{
nicID,
prefix,
false,
}
}
+}
- rt := make([]tcpip.Route, 0)
- exclude := tcpip.Route{
- Destination: prefix,
- NIC: nicID,
+func (n *ndpDispatcher) OnAutoGenAddress(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) bool {
+ if c := n.autoGenAddrC; c != nil {
+ c <- ndpAutoGenAddrEvent{
+ nicID,
+ addr,
+ newAddr,
+ }
+ }
+ return true
+}
+
+func (n *ndpDispatcher) OnAutoGenAddressDeprecated(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) {
+ if c := n.autoGenAddrC; c != nil {
+ c <- ndpAutoGenAddrEvent{
+ nicID,
+ addr,
+ deprecatedAddr,
+ }
+ }
+}
+
+func (n *ndpDispatcher) OnAutoGenAddressInvalidated(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) {
+ if c := n.autoGenAddrC; c != nil {
+ c <- ndpAutoGenAddrEvent{
+ nicID,
+ addr,
+ invalidatedAddr,
+ }
+ }
+}
+
+// Implements stack.NDPDispatcher.OnRecursiveDNSServerOption.
+func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration) {
+ if c := n.rdnssC; c != nil {
+ c <- ndpRDNSSEvent{
+ nicID,
+ ndpRDNSS{
+ addrs,
+ lifetime,
+ },
+ }
}
+}
- for _, r := range n.routeTable {
- if r != exclude {
- rt = append(rt, r)
+// Implements stack.NDPDispatcher.OnDHCPv6Configuration.
+func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration stack.DHCPv6ConfigurationFromNDPRA) {
+ if c := n.dhcpv6ConfigurationC; c != nil {
+ c <- ndpDHCPv6Event{
+ nicID,
+ configuration,
}
}
- n.routeTable = rt
- return rt
}
// TestDADResolve tests that an address successfully resolves after performing
@@ -246,7 +316,11 @@ func TestDADResolve(t *testing.T) {
}
for _, test := range tests {
+ test := test
+
t.Run(test.name, func(t *testing.T) {
+ t.Parallel()
+
ndpDisp := ndpDispatcher{
dadC: make(chan ndpDADEvent),
}
@@ -434,7 +508,7 @@ func TestDADFail(t *testing.T) {
}
opts.NDPConfigs.RetransmitTimer = time.Second * 2
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(opts)
if err := s.CreateNIC(1, e); err != nil {
t.Fatalf("CreateNIC(_) = %s", err)
@@ -515,7 +589,7 @@ func TestDADStop(t *testing.T) {
NDPConfigs: ndpConfigs,
}
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(opts)
if err := s.CreateNIC(1, e); err != nil {
t.Fatalf("CreateNIC(_) = %s", err)
@@ -616,7 +690,7 @@ func TestSetNDPConfigurations(t *testing.T) {
ndpDisp := ndpDispatcher{
dadC: make(chan ndpDADEvent),
}
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
NDPDisp: &ndpDisp,
@@ -739,21 +813,32 @@ func TestSetNDPConfigurations(t *testing.T) {
}
}
-// raBufWithOpts returns a valid NDP Router Advertisement with options.
-//
-// Note, raBufWithOpts does not populate any of the RA fields other than the
-// Router Lifetime.
-func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
+// raBufWithOptsAndDHCPv6 returns a valid NDP Router Advertisement with options
+// and DHCPv6 configurations specified.
+func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
icmpSize := header.ICMPv6HeaderSize + header.NDPRAMinimumSize + int(optSer.Length())
hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
pkt := header.ICMPv6(hdr.Prepend(icmpSize))
pkt.SetType(header.ICMPv6RouterAdvert)
pkt.SetCode(0)
- ra := header.NDPRouterAdvert(pkt.NDPPayload())
+ raPayload := pkt.NDPPayload()
+ ra := header.NDPRouterAdvert(raPayload)
+ // Populate the Router Lifetime.
+ binary.BigEndian.PutUint16(raPayload[2:], rl)
+ // Populate the Managed Address flag field.
+ if managedAddress {
+ // The Managed Addresses flag field is the 7th bit of byte #1 (0-indexing)
+ // of the RA payload.
+ raPayload[1] |= (1 << 7)
+ }
+ // Populate the Other Configurations flag field.
+ if otherConfigurations {
+ // The Other Configurations flag field is the 6th bit of byte #1
+ // (0-indexing) of the RA payload.
+ raPayload[1] |= (1 << 6)
+ }
opts := ra.Options()
opts.Serialize(optSer)
- // Populate the Router Lifetime.
- binary.BigEndian.PutUint16(pkt.NDPPayload()[2:], rl)
pkt.SetChecksum(header.ICMPv6Checksum(pkt, ip, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
payloadLength := hdr.UsedLength()
iph := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
@@ -768,6 +853,23 @@ func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializ
return tcpip.PacketBuffer{Data: hdr.View().ToVectorisedView()}
}
+// raBufWithOpts returns a valid NDP Router Advertisement with options.
+//
+// Note, raBufWithOpts does not populate any of the RA fields other than the
+// Router Lifetime.
+func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) tcpip.PacketBuffer {
+ return raBufWithOptsAndDHCPv6(ip, rl, false, false, optSer)
+}
+
+// raBufWithDHCPv6 returns a valid NDP Router Advertisement with DHCPv6 related
+// fields set.
+//
+// Note, raBufWithDHCPv6 does not populate any of the RA fields other than the
+// DHCPv6 related ones.
+func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfigurations bool) tcpip.PacketBuffer {
+	return raBufWithOptsAndDHCPv6(ip, 0, managedAddresses, otherConfigurations, header.NDPOptionsSerializer{})
+}
+
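Editorial aside (not part of the change above): raBufWithOptsAndDHCPv6 sets the Managed Address (M) and Other Configurations (O) flags by ORing bits into byte 1 of the RA payload. A minimal sketch of that packing, with a hypothetical helper name:

package ndpsketch // illustrative only

// raFlagsByte packs the M and O flags into the Router Advertisement flags
// byte: M is bit 7 and O is bit 6, counting bits from the least significant.
func raFlagsByte(managedAddress, otherConfigurations bool) byte {
	var b byte
	if managedAddress {
		b |= 1 << 7 // Managed Address flag
	}
	if otherConfigurations {
		b |= 1 << 6 // Other Configurations flag
	}
	return b
}
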
// raBuf returns a valid NDP Router Advertisement.
//
// Note, raBuf does not populate any of the RA fields other than the
@@ -781,16 +883,33 @@ func raBuf(ip tcpip.Address, rl uint16) tcpip.PacketBuffer {
//
// Note, raBufWithPI does not populate any of the RA fields other than the
// Router Lifetime.
-func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink bool, vl uint32) tcpip.PacketBuffer {
+func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) tcpip.PacketBuffer {
flags := uint8(0)
if onLink {
- flags |= 128
+ // The OnLink flag is the 7th bit in the flags byte.
+ flags |= 1 << 7
+ }
+ if auto {
+ // The Address Auto-Configuration flag is the 6th bit in the
+ // flags byte.
+ flags |= 1 << 6
}
+ // A valid header.NDPPrefixInformation must be 30 bytes.
buf := [30]byte{}
+ // The first byte in a header.NDPPrefixInformation is the Prefix Length
+ // field.
buf[0] = uint8(prefix.PrefixLen)
+ // The 2nd byte within a header.NDPPrefixInformation is the Flags field.
buf[1] = flags
+ // The Valid Lifetime field starts after the 2nd byte within a
+ // header.NDPPrefixInformation.
binary.BigEndian.PutUint32(buf[2:], vl)
+ // The Preferred Lifetime field starts after the 6th byte within a
+ // header.NDPPrefixInformation.
+ binary.BigEndian.PutUint32(buf[6:], pl)
+ // The Prefix Address field starts after the 14th byte within a
+ // header.NDPPrefixInformation.
copy(buf[14:], prefix.Address)
return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{
header.NDPPrefixInformation(buf[:]),
@@ -812,10 +931,12 @@ func TestNoRouterDiscovery(t *testing.T) {
forwarding := i&4 == 0
t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverDefaultRouters(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
+ t.Parallel()
+
ndpDisp := ndpDispatcher{
- routerC: make(chan ndpRouterEvent, 10),
+ routerC: make(chan ndpRouterEvent, 1),
}
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
NDPConfigs: stack.NDPConfigurations{
@@ -835,19 +956,27 @@ func TestNoRouterDiscovery(t *testing.T) {
select {
case <-ndpDisp.routerC:
t.Fatal("unexpectedly discovered a router when configured not to")
- case <-time.After(defaultTimeout):
+ default:
}
})
}
}
+// checkRouterEvent returns a diff of e against the expected event for addr on
+// the NIC with ID 1 with the discovered flag set to discovered.
+func checkRouterEvent(e ndpRouterEvent, addr tcpip.Address, discovered bool) string {
+ return cmp.Diff(ndpRouterEvent{nicID: 1, addr: addr, discovered: discovered}, e, cmp.AllowUnexported(e))
+}
+
// TestRouterDiscoveryDispatcherNoRemember tests that the stack does not
// remember a discovered router when the dispatcher asks it not to.
func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
+ t.Parallel()
+
ndpDisp := ndpDispatcher{
- routerC: make(chan ndpRouterEvent, 10),
+ routerC: make(chan ndpRouterEvent, 1),
}
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
NDPConfigs: stack.NDPConfigurations{
@@ -861,59 +990,36 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
t.Fatalf("CreateNIC(1) = %s", err)
}
- routeTable := []tcpip.Route{
- {
- header.IPv6EmptySubnet,
- llAddr3,
- 1,
- },
- }
- s.SetRouteTable(routeTable)
-
- // Rx an RA with short lifetime.
- lifetime := time.Duration(1)
- e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, uint16(lifetime)))
+ // Receive an RA for a router we should not remember.
+ const lifetimeSeconds = 1
+ e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, lifetimeSeconds))
select {
- case r := <-ndpDisp.routerC:
- if r.nicID != 1 {
- t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
- }
- if r.addr != llAddr2 {
- t.Fatalf("got r.addr = %s, want = %s", r.addr, llAddr2)
- }
- if !r.discovered {
- t.Fatal("got r.discovered = false, want = true")
+ case e := <-ndpDisp.routerC:
+ if diff := checkRouterEvent(e, llAddr2, true); diff != "" {
+ t.Errorf("router event mismatch (-want +got):\n%s", diff)
}
- case <-time.After(defaultTimeout):
- t.Fatal("timeout waiting for router discovery event")
+ default:
+ t.Fatal("expected router discovery event")
}
- // Original route table should not have been modified.
- if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable)
- }
-
- // Wait for the normal invalidation time plus an extra second to
- // make sure we do not actually receive any invalidation events as
- // we should not have remembered the router in the first place.
+ // Wait for the invalidation time plus some buffer to make sure we do
+ // not actually receive any invalidation events as we should not have
+ // remembered the router in the first place.
select {
case <-ndpDisp.routerC:
t.Fatal("should not have received any router events")
- case <-time.After(lifetime*time.Second + defaultTimeout):
- }
-
- // Original route table should not have been modified.
- if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable)
+ case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
}
}
func TestRouterDiscovery(t *testing.T) {
+ t.Parallel()
+
ndpDisp := ndpDispatcher{
- routerC: make(chan ndpRouterEvent, 10),
+ routerC: make(chan ndpRouterEvent, 1),
rememberRouter: true,
}
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
NDPConfigs: stack.NDPConfigurations{
@@ -923,22 +1029,29 @@ func TestRouterDiscovery(t *testing.T) {
NDPDisp: &ndpDisp,
})
- waitForEvent := func(addr tcpip.Address, discovered bool, timeout time.Duration) {
+ expectRouterEvent := func(addr tcpip.Address, discovered bool) {
t.Helper()
select {
- case r := <-ndpDisp.routerC:
- if r.nicID != 1 {
- t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
- }
- if r.addr != addr {
- t.Fatalf("got r.addr = %s, want = %s", r.addr, addr)
+ case e := <-ndpDisp.routerC:
+ if diff := checkRouterEvent(e, addr, discovered); diff != "" {
+ t.Errorf("router event mismatch (-want +got):\n%s", diff)
}
- if r.discovered != discovered {
- t.Fatalf("got r.discovered = %t, want = %t", r.discovered, discovered)
+ default:
+ t.Fatal("expected router discovery event")
+ }
+ }
+
+ expectAsyncRouterInvalidationEvent := func(addr tcpip.Address, timeout time.Duration) {
+ t.Helper()
+
+ select {
+ case e := <-ndpDisp.routerC:
+ if diff := checkRouterEvent(e, addr, false); diff != "" {
+ t.Errorf("router event mismatch (-want +got):\n%s", diff)
}
case <-time.After(timeout):
- t.Fatal("timeout waiting for router discovery event")
+ t.Fatal("timed out waiting for router discovery event")
}
}
@@ -952,40 +1065,25 @@ func TestRouterDiscovery(t *testing.T) {
select {
case <-ndpDisp.routerC:
t.Fatal("unexpectedly discovered a router with 0 lifetime")
- case <-time.After(defaultTimeout):
+ default:
}
// Rx an RA from lladdr2 with a huge lifetime.
e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000))
- waitForEvent(llAddr2, true, defaultTimeout)
-
- // Should have a default route through the discovered router.
- if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}}; !cmp.Equal(got, want) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
- }
+ expectRouterEvent(llAddr2, true)
// Rx an RA from another router (lladdr3) with non-zero lifetime.
- l3Lifetime := time.Duration(6)
- e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr3, uint16(l3Lifetime)))
- waitForEvent(llAddr3, true, defaultTimeout)
-
- // Should have default routes through the discovered routers.
- if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}, {header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
- }
+ const l3LifetimeSeconds = 6
+ e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr3, l3LifetimeSeconds))
+ expectRouterEvent(llAddr3, true)
// Rx an RA from lladdr2 with lesser lifetime.
- l2Lifetime := time.Duration(2)
- e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, uint16(l2Lifetime)))
+ const l2LifetimeSeconds = 2
+ e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, l2LifetimeSeconds))
select {
case <-ndpDisp.routerC:
t.Fatal("Should not receive a router event when updating lifetimes for known routers")
- case <-time.After(defaultTimeout):
- }
-
- // Should still have a default route through the discovered routers.
- if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr2, 1}, {header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+ default:
}
// Wait for lladdr2's router invalidation timer to fire. The lifetime
@@ -995,31 +1093,15 @@ func TestRouterDiscovery(t *testing.T) {
// Wait for the normal lifetime plus an extra bit for the
// router to get invalidated. If we don't get an invalidation
// event after this time, then something is wrong.
- waitForEvent(llAddr2, false, l2Lifetime*time.Second+defaultTimeout)
-
- // Should no longer have the default route through lladdr2.
- if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
- }
+ expectAsyncRouterInvalidationEvent(llAddr2, l2LifetimeSeconds*time.Second+defaultTimeout)
// Rx an RA from lladdr2 with huge lifetime.
e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000))
- waitForEvent(llAddr2, true, defaultTimeout)
-
- // Should have a default route through the discovered routers.
- if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}, {header.IPv6EmptySubnet, llAddr2, 1}}; !cmp.Equal(got, want) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
- }
+ expectRouterEvent(llAddr2, true)
// Rx an RA from lladdr2 with zero lifetime. It should be invalidated.
e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0))
- waitForEvent(llAddr2, false, defaultTimeout)
-
- // Should have deleted the default route through the router that just
- // got invalidated.
- if got, want := s.GetRouteTable(), []tcpip.Route{{header.IPv6EmptySubnet, llAddr3, 1}}; !cmp.Equal(got, want) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
- }
+ expectRouterEvent(llAddr2, false)
// Wait for lladdr3's router invalidation timer to fire. The lifetime
// of the router should have been updated to the most recent (smaller)
@@ -1028,23 +1110,19 @@ func TestRouterDiscovery(t *testing.T) {
// Wait for the normal lifetime plus an extra bit for the
// router to get invalidated. If we don't get an invalidation
// event after this time, then something is wrong.
- waitForEvent(llAddr3, false, l3Lifetime*time.Second+defaultTimeout)
-
- // Should not have any routes now that all discovered routers have been
- // invalidated.
- if got := len(s.GetRouteTable()); got != 0 {
- t.Fatalf("got len(s.GetRouteTable()) = %d, want = 0", got)
- }
+ expectAsyncRouterInvalidationEvent(llAddr3, l3LifetimeSeconds*time.Second+defaultTimeout)
}
// TestRouterDiscoveryMaxRouters tests that only
// stack.MaxDiscoveredDefaultRouters discovered routers are remembered.
func TestRouterDiscoveryMaxRouters(t *testing.T) {
+ t.Parallel()
+
ndpDisp := ndpDispatcher{
- routerC: make(chan ndpRouterEvent, 10),
+ routerC: make(chan ndpRouterEvent, 1),
rememberRouter: true,
}
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
NDPConfigs: stack.NDPConfigurations{
@@ -1058,8 +1136,6 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
t.Fatalf("CreateNIC(1) = %s", err)
}
- expectedRt := [stack.MaxDiscoveredDefaultRouters]tcpip.Route{}
-
// Receive an RA from 2 more than the max number of discovered routers.
for i := 1; i <= stack.MaxDiscoveredDefaultRouters+2; i++ {
linkAddr := []byte{2, 2, 3, 4, 5, 0}
@@ -1069,36 +1145,23 @@ func TestRouterDiscoveryMaxRouters(t *testing.T) {
e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr, 5))
if i <= stack.MaxDiscoveredDefaultRouters {
- expectedRt[i-1] = tcpip.Route{header.IPv6EmptySubnet, llAddr, 1}
select {
- case r := <-ndpDisp.routerC:
- if r.nicID != 1 {
- t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+ case e := <-ndpDisp.routerC:
+ if diff := checkRouterEvent(e, llAddr, true); diff != "" {
+ t.Errorf("router event mismatch (-want +got):\n%s", diff)
}
- if r.addr != llAddr {
- t.Fatalf("got r.addr = %s, want = %s", r.addr, llAddr)
- }
- if !r.discovered {
- t.Fatal("got r.discovered = false, want = true")
- }
- case <-time.After(defaultTimeout):
- t.Fatal("timeout waiting for router discovery event")
+ default:
+ t.Fatal("expected router discovery event")
}
} else {
select {
case <-ndpDisp.routerC:
t.Fatal("should not have discovered a new router after we already discovered the max number of routers")
- case <-time.After(defaultTimeout):
+ default:
}
}
}
-
- // Should only have default routes for the first
- // stack.MaxDiscoveredDefaultRouters discovered routers.
- if got := s.GetRouteTable(); !cmp.Equal(got, expectedRt[:]) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, expectedRt)
- }
}
// TestNoPrefixDiscovery tests that prefix discovery will not be performed if
@@ -1121,10 +1184,12 @@ func TestNoPrefixDiscovery(t *testing.T) {
forwarding := i&4 == 0
t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverOnLinkPrefixes(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
+ t.Parallel()
+
ndpDisp := ndpDispatcher{
- prefixC: make(chan ndpPrefixEvent, 10),
+ prefixC: make(chan ndpPrefixEvent, 1),
}
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
NDPConfigs: stack.NDPConfigurations{
@@ -1140,30 +1205,34 @@ func TestNoPrefixDiscovery(t *testing.T) {
}
// Rx an RA with prefix with non-zero lifetime.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, 10))
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, 10, 0))
select {
case <-ndpDisp.prefixC:
t.Fatal("unexpectedly discovered a prefix when configured not to")
- case <-time.After(defaultTimeout):
+ default:
}
})
}
}
+// checkPrefixEvent returns a diff of e against the expected event for prefix
+// on the NIC with ID 1 with the discovered flag set to discovered.
+func checkPrefixEvent(e ndpPrefixEvent, prefix tcpip.Subnet, discovered bool) string {
+ return cmp.Diff(ndpPrefixEvent{nicID: 1, prefix: prefix, discovered: discovered}, e, cmp.AllowUnexported(e))
+}
+
// TestPrefixDiscoveryDispatcherNoRemember tests that the stack does not
// remember a discovered on-link prefix when the dispatcher asks it not to.
func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
- prefix := tcpip.AddressWithPrefix{
- Address: tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
- PrefixLen: 64,
- }
- subnet := prefix.Subnet()
+ t.Parallel()
+
+ prefix, subnet, _ := prefixSubnetAddr(0, "")
ndpDisp := ndpDispatcher{
- prefixC: make(chan ndpPrefixEvent, 10),
+ prefixC: make(chan ndpPrefixEvent, 1),
}
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
NDPConfigs: stack.NDPConfigurations{
@@ -1178,75 +1247,40 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
t.Fatalf("CreateNIC(1) = %s", err)
}
- routeTable := []tcpip.Route{
- {
- header.IPv6EmptySubnet,
- llAddr3,
- 1,
- },
- }
- s.SetRouteTable(routeTable)
-
- // Rx an RA with prefix with a short lifetime.
- const lifetime = 1
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, lifetime))
+ // Receive an RA with prefix that we should not remember.
+ const lifetimeSeconds = 1
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, lifetimeSeconds, 0))
select {
- case r := <-ndpDisp.prefixC:
- if r.nicID != 1 {
- t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
- }
- if r.prefix != subnet {
- t.Fatalf("got r.prefix = %s, want = %s", r.prefix, subnet)
+ case e := <-ndpDisp.prefixC:
+ if diff := checkPrefixEvent(e, subnet, true); diff != "" {
+ t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
}
- if !r.discovered {
- t.Fatal("got r.discovered = false, want = true")
- }
- case <-time.After(defaultTimeout):
- t.Fatal("timeout waiting for prefix discovery event")
+ default:
+ t.Fatal("expected prefix discovery event")
}
- // Original route table should not have been modified.
- if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable)
- }
-
- // Wait for the normal invalidation time plus some buffer to
- // make sure we do not actually receive any invalidation events as
- // we should not have remembered the prefix in the first place.
+ // Wait for the invalidation time plus some buffer to make sure we do
+ // not actually receive any invalidation events as we should not have
+ // remembered the prefix in the first place.
select {
case <-ndpDisp.prefixC:
t.Fatal("should not have received any prefix events")
- case <-time.After(lifetime*time.Second + defaultTimeout):
- }
-
- // Original route table should not have been modified.
- if got := s.GetRouteTable(); !cmp.Equal(got, routeTable) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, routeTable)
+ case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
}
}
func TestPrefixDiscovery(t *testing.T) {
- prefix1 := tcpip.AddressWithPrefix{
- Address: tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
- PrefixLen: 64,
- }
- prefix2 := tcpip.AddressWithPrefix{
- Address: tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x09\x00\x00\x00\x00\x00\x00\x00\x00"),
- PrefixLen: 64,
- }
- prefix3 := tcpip.AddressWithPrefix{
- Address: tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x09\x0a\x00\x00\x00\x00\x00\x00\x00"),
- PrefixLen: 72,
- }
- subnet1 := prefix1.Subnet()
- subnet2 := prefix2.Subnet()
- subnet3 := prefix3.Subnet()
+ t.Parallel()
+
+ prefix1, subnet1, _ := prefixSubnetAddr(0, "")
+ prefix2, subnet2, _ := prefixSubnetAddr(1, "")
+ prefix3, subnet3, _ := prefixSubnetAddr(2, "")
ndpDisp := ndpDispatcher{
- prefixC: make(chan ndpPrefixEvent, 10),
+ prefixC: make(chan ndpPrefixEvent, 1),
rememberPrefix: true,
}
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
NDPConfigs: stack.NDPConfigurations{
@@ -1256,106 +1290,72 @@ func TestPrefixDiscovery(t *testing.T) {
NDPDisp: &ndpDisp,
})
- waitForEvent := func(subnet tcpip.Subnet, discovered bool, timeout time.Duration) {
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(1) = %s", err)
+ }
+
+ expectPrefixEvent := func(prefix tcpip.Subnet, discovered bool) {
t.Helper()
select {
- case r := <-ndpDisp.prefixC:
- if r.nicID != 1 {
- t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
+ case e := <-ndpDisp.prefixC:
+ if diff := checkPrefixEvent(e, prefix, discovered); diff != "" {
+ t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
}
- if r.prefix != subnet {
- t.Fatalf("got r.prefix = %s, want = %s", r.prefix, subnet)
- }
- if r.discovered != discovered {
- t.Fatalf("got r.discovered = %t, want = %t", r.discovered, discovered)
- }
- case <-time.After(timeout):
- t.Fatal("timeout waiting for prefix discovery event")
+ default:
+ t.Fatal("expected prefix discovery event")
}
}
- if err := s.CreateNIC(1, e); err != nil {
- t.Fatalf("CreateNIC(1) = %s", err)
- }
-
// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
// with zero valid lifetime.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 0))
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0))
select {
case <-ndpDisp.prefixC:
t.Fatal("unexpectedly discovered a prefix with 0 lifetime")
- case <-time.After(defaultTimeout):
+ default:
}
// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
// with non-zero lifetime.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 100))
- waitForEvent(subnet1, true, defaultTimeout)
-
- // Should have added a device route for subnet1 through the nic.
- if got, want := s.GetRouteTable(), []tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
- }
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 100, 0))
+ expectPrefixEvent(subnet1, true)
// Receive an RA with prefix2 in a PI.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, 100))
- waitForEvent(subnet2, true, defaultTimeout)
-
- // Should have added a device route for subnet2 through the nic.
- if got, want := s.GetRouteTable(), []tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}, {subnet2, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
- }
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, 100, 0))
+ expectPrefixEvent(subnet2, true)
// Receive an RA with prefix3 in a PI.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, 100))
- waitForEvent(subnet3, true, defaultTimeout)
-
- // Should have added a device route for subnet3 through the nic.
- if got, want := s.GetRouteTable(), []tcpip.Route{{subnet1, tcpip.Address([]byte(nil)), 1}, {subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
- }
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 100, 0))
+ expectPrefixEvent(subnet3, true)
// Receive an RA with prefix1 in a PI with lifetime = 0.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, 0))
- waitForEvent(subnet1, false, defaultTimeout)
-
- // Should have removed the device route for subnet1 through the nic.
- if got, want := s.GetRouteTable(), []tcpip.Route{{subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
- }
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0))
+ expectPrefixEvent(subnet1, false)
// Receive an RA with prefix2 in a PI with lesser lifetime.
lifetime := uint32(2)
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, lifetime))
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, lifetime, 0))
select {
case <-ndpDisp.prefixC:
t.Fatal("unexpectedly received prefix event when updating lifetime")
- case <-time.After(defaultTimeout):
- }
-
- // Should not have updated route table.
- if got, want := s.GetRouteTable(), []tcpip.Route{{subnet2, tcpip.Address([]byte(nil)), 1}, {subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+ default:
}
// Wait for prefix2's most recent invalidation timer plus some buffer to
// expire.
- waitForEvent(subnet2, false, time.Duration(lifetime)*time.Second+defaultTimeout)
-
- // Should have removed the device route for subnet2 through the nic.
- if got, want := s.GetRouteTable(), []tcpip.Route{{subnet3, tcpip.Address([]byte(nil)), 1}}; !cmp.Equal(got, want) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, want)
+ select {
+ case e := <-ndpDisp.prefixC:
+ if diff := checkPrefixEvent(e, subnet2, false); diff != "" {
+ t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
+ }
+ case <-time.After(time.Duration(lifetime)*time.Second + defaultTimeout):
+ t.Fatal("timed out waiting for prefix discovery event")
}
// Receive RA to invalidate prefix3.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, 0))
- waitForEvent(subnet3, false, defaultTimeout)
-
- // Should not have any routes.
- if got := len(s.GetRouteTable()); got != 0 {
- t.Fatalf("got len(s.GetRouteTable()) = %d, want = 0", got)
- }
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 0, 0))
+ expectPrefixEvent(subnet3, false)
}
func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
@@ -1364,10 +1364,10 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
// invalidate the prefix.
const testInfiniteLifetimeSeconds = 2
const testInfiniteLifetime = testInfiniteLifetimeSeconds * time.Second
- saved := header.NDPPrefixInformationInfiniteLifetime
- header.NDPPrefixInformationInfiniteLifetime = testInfiniteLifetime
+ saved := header.NDPInfiniteLifetime
+ header.NDPInfiniteLifetime = testInfiniteLifetime
defer func() {
- header.NDPPrefixInformationInfiniteLifetime = saved
+ header.NDPInfiniteLifetime = saved
}()
prefix := tcpip.AddressWithPrefix{
@@ -1377,10 +1377,10 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
subnet := prefix.Subnet()
ndpDisp := ndpDispatcher{
- prefixC: make(chan ndpPrefixEvent, 10),
+ prefixC: make(chan ndpPrefixEvent, 1),
rememberPrefix: true,
}
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
NDPConfigs: stack.NDPConfigurations{
@@ -1390,33 +1390,27 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
NDPDisp: &ndpDisp,
})
- waitForEvent := func(discovered bool, timeout time.Duration) {
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(1) = %s", err)
+ }
+
+ expectPrefixEvent := func(prefix tcpip.Subnet, discovered bool) {
t.Helper()
select {
- case r := <-ndpDisp.prefixC:
- if r.nicID != 1 {
- t.Errorf("got r.nicID = %d, want = 1", r.nicID)
- }
- if r.prefix != subnet {
- t.Errorf("got r.prefix = %s, want = %s", r.prefix, subnet)
- }
- if r.discovered != discovered {
- t.Errorf("got r.discovered = %t, want = %t", r.discovered, discovered)
+ case e := <-ndpDisp.prefixC:
+ if diff := checkPrefixEvent(e, prefix, discovered); diff != "" {
+ t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
}
- case <-time.After(timeout):
- t.Fatal("timeout waiting for prefix discovery event")
+ default:
+ t.Fatal("expected prefix discovery event")
}
}
- if err := s.CreateNIC(1, e); err != nil {
- t.Fatalf("CreateNIC(1) = %s", err)
- }
-
// Receive an RA with prefix in an NDP Prefix Information option (PI)
// with infinite valid lifetime which should not get invalidated.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds))
- waitForEvent(true, defaultTimeout)
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0))
+ expectPrefixEvent(subnet, true)
select {
case <-ndpDisp.prefixC:
t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
@@ -1425,16 +1419,23 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
// Receive an RA with finite lifetime.
// The prefix should get invalidated after 1s.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds-1))
- waitForEvent(false, testInfiniteLifetime)
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0))
+ select {
+ case e := <-ndpDisp.prefixC:
+ if diff := checkPrefixEvent(e, subnet, false); diff != "" {
+ t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
+ }
+ case <-time.After(testInfiniteLifetime):
+ t.Fatal("timed out waiting for prefix discovery event")
+ }
// Receive an RA with finite lifetime.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds-1))
- waitForEvent(true, defaultTimeout)
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0))
+ expectPrefixEvent(subnet, true)
// Receive an RA with prefix with an infinite lifetime.
// The prefix should not be invalidated.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds))
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0))
select {
case <-ndpDisp.prefixC:
t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
@@ -1443,7 +1444,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
// Receive an RA with a prefix with a lifetime value greater than the
// set infinite lifetime value.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, testInfiniteLifetimeSeconds+1))
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds+1, 0))
select {
case <-ndpDisp.prefixC:
t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
@@ -1452,18 +1453,20 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
// Receive an RA with 0 lifetime.
// The prefix should get invalidated.
- e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, 0))
- waitForEvent(false, defaultTimeout)
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, 0, 0))
+ expectPrefixEvent(subnet, false)
}
// TestPrefixDiscoveryMaxRouters tests that only
// stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered.
func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
+ t.Parallel()
+
ndpDisp := ndpDispatcher{
prefixC: make(chan ndpPrefixEvent, stack.MaxDiscoveredOnLinkPrefixes+3),
rememberPrefix: true,
}
- e := channel.New(10, 1280, linkAddr1)
+ e := channel.New(0, 1280, linkAddr1)
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
NDPConfigs: stack.NDPConfigurations{
@@ -1479,7 +1482,6 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
}
optSer := make(header.NDPOptionsSerializer, stack.MaxDiscoveredOnLinkPrefixes+2)
- expectedRt := [stack.MaxDiscoveredOnLinkPrefixes]tcpip.Route{}
prefixes := [stack.MaxDiscoveredOnLinkPrefixes + 2]tcpip.Subnet{}
// Receive an RA with 2 more than the max number of discovered on-link
@@ -1499,41 +1501,1937 @@ func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
copy(buf[14:], prefix.Address)
optSer[i] = header.NDPPrefixInformation(buf[:])
-
- if i < stack.MaxDiscoveredOnLinkPrefixes {
- expectedRt[i] = tcpip.Route{prefixes[i], tcpip.Address([]byte(nil)), 1}
- }
}
e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, optSer))
for i := 0; i < stack.MaxDiscoveredOnLinkPrefixes+2; i++ {
if i < stack.MaxDiscoveredOnLinkPrefixes {
select {
- case r := <-ndpDisp.prefixC:
- if r.nicID != 1 {
- t.Fatalf("got r.nicID = %d, want = 1", r.nicID)
- }
- if r.prefix != prefixes[i] {
- t.Fatalf("got r.prefix = %s, want = %s", r.prefix, prefixes[i])
- }
- if !r.discovered {
- t.Fatal("got r.discovered = false, want = true")
+ case e := <-ndpDisp.prefixC:
+ if diff := checkPrefixEvent(e, prefixes[i], true); diff != "" {
+ t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
}
- case <-time.After(defaultTimeout):
- t.Fatal("timeout waiting for prefix discovery event")
+ default:
+ t.Fatal("expected prefix discovery event")
}
} else {
select {
case <-ndpDisp.prefixC:
t.Fatal("should not have discovered a new prefix after we already discovered the max number of prefixes")
+ default:
+ }
+ }
+ }
+}
+
+// contains reports whether list contains the IPv6 address item.
+func contains(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool {
+ protocolAddress := tcpip.ProtocolAddress{
+ Protocol: header.IPv6ProtocolNumber,
+ AddressWithPrefix: item,
+ }
+
+ for _, i := range list {
+ if i == protocolAddress {
+ return true
+ }
+ }
+
+ return false
+}
+
+// TestNoAutoGenAddr tests that SLAAC is not performed when configured not to.
+func TestNoAutoGenAddr(t *testing.T) {
+ prefix, _, _ := prefixSubnetAddr(0, "")
+
+ // Being configured to auto-generate addresses means handle and
+ // autogen are set to true and forwarding is set to false.
+ // This tests all possible combinations of the configurations,
+ // except for the configuration where handle = true, autogen =
+ // true and forwarding = false (the required configuration to do
+	// SLAAC) - that will be done in other tests.
+ for i := 0; i < 7; i++ {
+ handle := i&1 != 0
+ autogen := i&2 != 0
+ forwarding := i&4 == 0
+
+ t.Run(fmt.Sprintf("HandleRAs(%t), AutoGenAddr(%t), Forwarding(%t)", handle, autogen, forwarding), func(t *testing.T) {
+ t.Parallel()
+
+ ndpDisp := ndpDispatcher{
+ autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+ }
+ e := channel.New(0, 1280, linkAddr1)
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ HandleRAs: handle,
+ AutoGenGlobalAddresses: autogen,
+ },
+ NDPDisp: &ndpDisp,
+ })
+ s.SetForwarding(forwarding)
+
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(1) = %s", err)
+ }
+
+ // Rx an RA with prefix with non-zero lifetime.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, false, true, 10, 0))
+
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly auto-generated an address when configured not to")
+ default:
+ }
+ })
+ }
+}
+
+// checkAutoGenAddrEvent checks that the auto-generated address event e is for
+// addr on the NIC with ID 1, and that the event type is eventType.
+func checkAutoGenAddrEvent(e ndpAutoGenAddrEvent, addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) string {
+ return cmp.Diff(ndpAutoGenAddrEvent{nicID: 1, addr: addr, eventType: eventType}, e, cmp.AllowUnexported(e))
+}
+
+// TestAutoGenAddr tests that an address is properly generated and invalidated
+// when configured to do so.
+func TestAutoGenAddr(t *testing.T) {
+ const newMinVL = 2
+ newMinVLDuration := newMinVL * time.Second
+ saved := stack.MinPrefixInformationValidLifetimeForUpdate
+ defer func() {
+ stack.MinPrefixInformationValidLifetimeForUpdate = saved
+ }()
+ stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
+
+ prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+ prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+
+ ndpDisp := ndpDispatcher{
+ autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+ }
+ e := channel.New(0, 1280, linkAddr1)
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ HandleRAs: true,
+ AutoGenGlobalAddresses: true,
+ },
+ NDPDisp: &ndpDisp,
+ })
+
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(1) = %s", err)
+ }
+
+ expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+ t.Helper()
+
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+ default:
+ t.Fatal("expected addr auto gen event")
+ }
+ }
+
+ // Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+ // with zero valid lifetime.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 0, 0))
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly auto-generated an address with 0 lifetime")
+ default:
+ }
+
+ // Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+ // with non-zero lifetime.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+ expectAutoGenAddrEvent(addr1, newAddr)
+ if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+ t.Fatalf("Should have %s in the list of addresses", addr1)
+ }
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with preferred lifetime > valid lifetime.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 5, 6))
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly auto-generated an address with preferred lifetime > valid lifetime")
+ default:
+ }
+
+ // Receive an RA with prefix2 in a PI.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+ expectAutoGenAddrEvent(addr2, newAddr)
+ if !contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+ t.Fatalf("Should have %s in the list of addresses", addr1)
+ }
+ if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) {
+ t.Fatalf("Should have %s in the list of addresses", addr2)
+ }
+
+ // Refresh valid lifetime for addr of prefix1.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly auto-generated an address when we already have an address for a prefix")
+ default:
+ }
+
+ // Wait for addr of prefix1 to be invalidated.
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+ case <-time.After(newMinVLDuration + defaultTimeout):
+ t.Fatal("timed out waiting for addr auto gen event")
+ }
+ if contains(s.NICInfo()[1].ProtocolAddresses, addr1) {
+ t.Fatalf("Should not have %s in the list of addresses", addr1)
+ }
+ if !contains(s.NICInfo()[1].ProtocolAddresses, addr2) {
+ t.Fatalf("Should have %s in the list of addresses", addr2)
+ }
+}
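A note on the raBufWithPI test helper used throughout these tests: it appears to be defined earlier in ndp_test.go and is not shown in this hunk. From its call sites here, the positional arguments appear to be the advertising router's link-local source address, the router lifetime, the prefix, the on-link flag, the autonomous (SLAAC) flag, the valid lifetime and the preferred lifetime. A minimal sketch of a typical call, with the assumed meaning of each argument as a comment:

    // Assumed argument order, inferred only from the call sites in this file.
    ra := raBufWithPI(
        llAddr2, // link-local source address of the advertising router
        0,       // router lifetime in seconds; 0 means not a default router
        prefix1, // prefix carried in the Prefix Information option
        true,    // on-link (L) flag
        true,    // autonomous address-configuration (A/SLAAC) flag
        100,     // valid lifetime in seconds
        0,       // preferred lifetime in seconds
    )
    e.InjectInbound(header.IPv6ProtocolNumber, ra)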
+
+// stackAndNdpDispatcherWithDefaultRoute returns an ndpDispatcher,
+// channel.Endpoint and stack.Stack.
+//
+// stack.Stack will have a default route through the router (llAddr3) installed
+// and a static link-address (linkAddr3) added to the link address cache for the
+// router.
+func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*ndpDispatcher, *channel.Endpoint, *stack.Stack) {
+ t.Helper()
+ ndpDisp := &ndpDispatcher{
+ autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+ }
+ e := channel.New(0, 1280, linkAddr1)
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ HandleRAs: true,
+ AutoGenGlobalAddresses: true,
+ },
+ NDPDisp: ndpDisp,
+ })
+ if err := s.CreateNIC(nicID, e); err != nil {
+ t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+ }
+ s.SetRouteTable([]tcpip.Route{{
+ Destination: header.IPv6EmptySubnet,
+ Gateway: llAddr3,
+ NIC: nicID,
+ }})
+ s.AddLinkAddress(nicID, llAddr3, linkAddr3)
+ return ndpDisp, e, s
+}
+
+// addrForNewConnectionTo returns the local address used when creating a new
+// connection to addr.
+func addrForNewConnectionTo(t *testing.T, s *stack.Stack, addr tcpip.FullAddress) tcpip.Address {
+ t.Helper()
+
+ wq := waiter.Queue{}
+ we, ch := waiter.NewChannelEntry(nil)
+ wq.EventRegister(&we, waiter.EventIn)
+ defer wq.EventUnregister(&we)
+ defer close(ch)
+ ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
+ if err != nil {
+ t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
+ }
+ defer ep.Close()
+ if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+ t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
+ }
+ if err := ep.Connect(addr); err != nil {
+ t.Fatalf("ep.Connect(%+v): %s", addr, err)
+ }
+ got, err := ep.GetLocalAddress()
+ if err != nil {
+ t.Fatalf("ep.GetLocalAddress(): %s", err)
+ }
+ return got.Addr
+}
+
+// addrForNewConnection returns the local address used when creating a new
+// connection.
+func addrForNewConnection(t *testing.T, s *stack.Stack) tcpip.Address {
+ t.Helper()
+
+ return addrForNewConnectionTo(t, s, dstAddr)
+}
+
+// addrForNewConnectionWithAddr returns the local address used when creating a
+// new connection with a specific local address.
+func addrForNewConnectionWithAddr(t *testing.T, s *stack.Stack, addr tcpip.FullAddress) tcpip.Address {
+ t.Helper()
+
+ wq := waiter.Queue{}
+ we, ch := waiter.NewChannelEntry(nil)
+ wq.EventRegister(&we, waiter.EventIn)
+ defer wq.EventUnregister(&we)
+ defer close(ch)
+ ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
+ if err != nil {
+ t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
+ }
+ defer ep.Close()
+ if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+ t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
+ }
+ if err := ep.Bind(addr); err != nil {
+ t.Fatalf("ep.Bind(%+v): %s", addr, err)
+ }
+ if err := ep.Connect(dstAddr); err != nil {
+ t.Fatalf("ep.Connect(%+v): %s", dstAddr, err)
+ }
+ got, err := ep.GetLocalAddress()
+ if err != nil {
+ t.Fatalf("ep.GetLocalAddress(): %s", err)
+ }
+ return got.Addr
+}
+
+// TestAutoGenAddrDeprecateFromPI tests deprecating a SLAAC address when
+// receiving a PI with 0 preferred lifetime.
+func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
+ const nicID = 1
+
+ prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+ prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+
+ ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+
+ expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+ t.Helper()
+
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+ default:
+ t.Fatal("expected addr auto gen event")
+ }
+ }
+
+ expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+ t.Helper()
+
+ if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+ t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+ } else if got != addr {
+ t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+ }
+
+ if got := addrForNewConnection(t, s); got != addr.Address {
+ t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+ }
+ }
+
+ // Receive PI for prefix1.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+ expectAutoGenAddrEvent(addr1, newAddr)
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+ t.Fatalf("should have %s in the list of addresses", addr1)
+ }
+ expectPrimaryAddr(addr1)
+
+	// Deprecate addr for prefix1 immediately.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+ expectAutoGenAddrEvent(addr1, deprecatedAddr)
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+ t.Fatalf("should have %s in the list of addresses", addr1)
+ }
+ // addr should still be the primary endpoint as there are no other addresses.
+ expectPrimaryAddr(addr1)
+
+ // Refresh lifetimes of addr generated from prefix1.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly got an auto-generated event")
+ default:
+ }
+ expectPrimaryAddr(addr1)
+
+ // Receive PI for prefix2.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+ expectAutoGenAddrEvent(addr2, newAddr)
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+ t.Fatalf("should have %s in the list of addresses", addr2)
+ }
+ expectPrimaryAddr(addr2)
+
+	// Deprecate addr for prefix2 immediately.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+ expectAutoGenAddrEvent(addr2, deprecatedAddr)
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+ t.Fatalf("should have %s in the list of addresses", addr2)
+ }
+ // addr1 should be the primary endpoint now since addr2 is deprecated but
+ // addr1 is not.
+ expectPrimaryAddr(addr1)
+ // addr2 is deprecated but if explicitly requested, it should be used.
+ fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID}
+ if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
+ }
+
+ // Another PI w/ 0 preferred lifetime should not result in a deprecation
+ // event.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly got an auto-generated event")
+ default:
+ }
+ expectPrimaryAddr(addr1)
+ if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
+ }
+
+ // Refresh lifetimes of addr generated from prefix2.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly got an auto-generated event")
+ default:
+ }
+ expectPrimaryAddr(addr2)
+}
+
+// TestAutoGenAddrTimerDeprecation tests that an address is properly deprecated
+// when its preferred lifetime expires.
+func TestAutoGenAddrTimerDeprecation(t *testing.T) {
+ const nicID = 1
+ const newMinVL = 2
+ newMinVLDuration := newMinVL * time.Second
+ saved := stack.MinPrefixInformationValidLifetimeForUpdate
+ defer func() {
+ stack.MinPrefixInformationValidLifetimeForUpdate = saved
+ }()
+ stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
+
+ prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+ prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+
+ ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+
+ expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+ t.Helper()
+
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+ default:
+ t.Fatal("expected addr auto gen event")
+ }
+ }
+
+ expectAutoGenAddrEventAfter := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
+ t.Helper()
+
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+ case <-time.After(timeout):
+ t.Fatal("timed out waiting for addr auto gen event")
+ }
+ }
+
+ expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+ t.Helper()
+
+ if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+ t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+ } else if got != addr {
+ t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+ }
+
+ if got := addrForNewConnection(t, s); got != addr.Address {
+ t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+ }
+ }
+
+ // Receive PI for prefix2.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+ expectAutoGenAddrEvent(addr2, newAddr)
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+ t.Fatalf("should have %s in the list of addresses", addr2)
+ }
+ expectPrimaryAddr(addr2)
+
+ // Receive a PI for prefix1.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90))
+ expectAutoGenAddrEvent(addr1, newAddr)
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+ t.Fatalf("should have %s in the list of addresses", addr1)
+ }
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+ t.Fatalf("should have %s in the list of addresses", addr2)
+ }
+ expectPrimaryAddr(addr1)
+
+ // Refresh lifetime for addr of prefix1.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly got an auto-generated event")
+ default:
+ }
+ expectPrimaryAddr(addr1)
+
+ // Wait for addr of prefix1 to be deprecated.
+ expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultTimeout)
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should have %s in the list of addresses", addr1)
+ }
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+ t.Fatalf("should have %s in the list of addresses", addr2)
+ }
+ // addr2 should be the primary endpoint now since addr1 is deprecated but
+ // addr2 is not.
+ expectPrimaryAddr(addr2)
+ // addr1 is deprecated but if explicitly requested, it should be used.
+ fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID}
+ if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
+ }
+
+ // Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make
+ // sure we do not get a deprecation event again.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly got an auto-generated event")
+ default:
+ }
+ expectPrimaryAddr(addr2)
+ if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
+ }
+
+ // Refresh lifetimes for addr of prefix1.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly got an auto-generated event")
+ default:
+ }
+ // addr1 is the primary endpoint again since it is non-deprecated now.
+ expectPrimaryAddr(addr1)
+
+ // Wait for addr of prefix1 to be deprecated.
+ expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultTimeout)
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should have %s in the list of addresses", addr1)
+ }
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+ t.Fatalf("should have %s in the list of addresses", addr2)
+ }
+ // addr2 should be the primary endpoint now since it is not deprecated.
+ expectPrimaryAddr(addr2)
+ if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
+ }
+
+ // Wait for addr of prefix1 to be invalidated.
+ expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultTimeout)
+ if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+ t.Fatalf("should not have %s in the list of addresses", addr1)
+ }
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+ t.Fatalf("should have %s in the list of addresses", addr2)
+ }
+ expectPrimaryAddr(addr2)
+
+ // Refresh both lifetimes for addr of prefix2 to the same value.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, newMinVL, newMinVL))
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly got an auto-generated event")
+ default:
+ }
+
+	// Wait for a deprecation event followed by an invalidation event, or just an
+	// invalidation event. We need to handle both orderings, but cannot
+	// deterministically choose which one occurs, because the deprecation and
+	// invalidation timers fire at the same time and may be handled in either
+	// order. If the invalidation handler runs first, it should cancel the
+	// pending deprecation.
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr2, deprecatedAddr); diff == "" {
+ // If we get a deprecation event first, we should get an invalidation
+ // event almost immediately after.
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
case <-time.After(defaultTimeout):
+ t.Fatal("timed out waiting for addr auto gen event")
}
+ } else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" {
+ // If we get an invalidation event first, we should not get a deprecation
+ // event after.
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly got an auto-generated event")
+ case <-time.After(defaultTimeout):
+ }
+ } else {
+ t.Fatalf("got unexpected auto-generated event")
}
+
+ case <-time.After(newMinVLDuration + defaultTimeout):
+ t.Fatal("timed out waiting for addr auto gen event")
+ }
+ if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+ t.Fatalf("should not have %s in the list of addresses", addr1)
+ }
+ if contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+ t.Fatalf("should not have %s in the list of addresses", addr2)
+ }
+ // Should not have any primary endpoints.
+ if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+ t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+ } else if want := (tcpip.AddressWithPrefix{}); got != want {
+ t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, want)
}
+ wq := waiter.Queue{}
+ we, ch := waiter.NewChannelEntry(nil)
+ wq.EventRegister(&we, waiter.EventIn)
+ defer wq.EventUnregister(&we)
+ defer close(ch)
+ ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
+ if err != nil {
+ t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
+ }
+ defer ep.Close()
+ if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+ t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
+ }
+
+ if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute {
+ t.Errorf("got ep.Connect(%+v) = %v, want = %s", dstAddr, err, tcpip.ErrNoRoute)
+ }
+}
- // Should only have device routes for the first
- // stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes.
- if got := s.GetRouteTable(); !cmp.Equal(got, expectedRt[:]) {
- t.Fatalf("got GetRouteTable = %v, want = %v", got, expectedRt)
+// TestAutoGenAddrFiniteToInfiniteToFiniteVL tests transitioning a SLAAC
+// address's valid lifetime between finite and infinite values.
+func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) {
+ const infiniteVLSeconds = 2
+ const minVLSeconds = 1
+ savedIL := header.NDPInfiniteLifetime
+ savedMinVL := stack.MinPrefixInformationValidLifetimeForUpdate
+ defer func() {
+ stack.MinPrefixInformationValidLifetimeForUpdate = savedMinVL
+ header.NDPInfiniteLifetime = savedIL
+ }()
+ stack.MinPrefixInformationValidLifetimeForUpdate = minVLSeconds * time.Second
+ header.NDPInfiniteLifetime = infiniteVLSeconds * time.Second
+
+ prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+ tests := []struct {
+ name string
+ infiniteVL uint32
+ }{
+ {
+ name: "EqualToInfiniteVL",
+ infiniteVL: infiniteVLSeconds,
+ },
+ // Our implementation supports changing header.NDPInfiniteLifetime for tests
+ // such that a packet can be received where the lifetime field has a value
+ // greater than header.NDPInfiniteLifetime. Because of this, we test to make
+ // sure that receiving a value greater than header.NDPInfiniteLifetime is
+ // handled the same as when receiving a value equal to
+ // header.NDPInfiniteLifetime.
+ {
+ name: "MoreThanInfiniteVL",
+ infiniteVL: infiniteVLSeconds + 1,
+ },
+ }
+
+ // This Run will not return until the parallel tests finish.
+ //
+ // We need this because we need to do some teardown work after the
+ // parallel tests complete.
+ //
+ // See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+ // more details.
+ t.Run("group", func(t *testing.T) {
+ for _, test := range tests {
+ test := test
+
+ t.Run(test.name, func(t *testing.T) {
+ t.Parallel()
+
+ ndpDisp := ndpDispatcher{
+ autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+ }
+ e := channel.New(0, 1280, linkAddr1)
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ HandleRAs: true,
+ AutoGenGlobalAddresses: true,
+ },
+ NDPDisp: &ndpDisp,
+ })
+
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(1) = %s", err)
+ }
+
+				// Receive an RA with the prefix with a finite VL.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, 0))
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr, newAddr); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+
+ default:
+ t.Fatal("expected addr auto gen event")
+ }
+
+				// Receive a new RA with the prefix with an infinite VL.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.infiniteVL, 0))
+
+				// Receive a new RA with the prefix with a finite VL.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, 0))
+
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+
+ case <-time.After(minVLSeconds*time.Second + defaultTimeout):
+ t.Fatal("timeout waiting for addr auto gen event")
+ }
+ })
+ }
+ })
+}
+
+// TestAutoGenAddrValidLifetimeUpdates tests that the valid lifetime of an
+// auto-generated address only gets updated when required to, as specified in
+// RFC 4862 section 5.5.3.e.
+func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
+ const infiniteVL = 4294967295
+ const newMinVL = 4
+ saved := stack.MinPrefixInformationValidLifetimeForUpdate
+ defer func() {
+ stack.MinPrefixInformationValidLifetimeForUpdate = saved
+ }()
+ stack.MinPrefixInformationValidLifetimeForUpdate = newMinVL * time.Second
+
+ prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+ tests := []struct {
+ name string
+ ovl uint32
+ nvl uint32
+ evl uint32
+ }{
+ // Should update the VL to the minimum VL for updating if the
+ // new VL is less than newMinVL but was originally greater than
+ // it.
+ {
+ "LargeVLToVLLessThanMinVLForUpdate",
+ 9999,
+ 1,
+ newMinVL,
+ },
+ {
+ "LargeVLTo0",
+ 9999,
+ 0,
+ newMinVL,
+ },
+ {
+ "InfiniteVLToVLLessThanMinVLForUpdate",
+ infiniteVL,
+ 1,
+ newMinVL,
+ },
+ {
+ "InfiniteVLTo0",
+ infiniteVL,
+ 0,
+ newMinVL,
+ },
+
+ // Should not update VL if original VL was less than newMinVL
+ // and the new VL is also less than newMinVL.
+ {
+ "ShouldNotUpdateWhenBothOldAndNewAreLessThanMinVLForUpdate",
+ newMinVL - 1,
+ newMinVL - 3,
+ newMinVL - 1,
+ },
+
+ // Should take the new VL if the new VL is greater than the
+ // remaining time or is greater than newMinVL.
+ {
+			"MoreThanMinVLToLesserButStillMoreThanMinVLForUpdate",
+ newMinVL + 5,
+ newMinVL + 3,
+ newMinVL + 3,
+ },
+ {
+ "SmallVLToGreaterVLButStillLessThanMinVLForUpdate",
+ newMinVL - 3,
+ newMinVL - 1,
+ newMinVL - 1,
+ },
+ {
+			"SmallVLToGreaterVLThatIsMoreThanMinVLForUpdate",
+ newMinVL - 3,
+ newMinVL + 1,
+ newMinVL + 1,
+ },
+ }
+
+ const delta = 500 * time.Millisecond
+
+ // This Run will not return until the parallel tests finish.
+ //
+ // We need this because we need to do some teardown work after the
+ // parallel tests complete.
+ //
+ // See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+ // more details.
+ t.Run("group", func(t *testing.T) {
+ for _, test := range tests {
+ test := test
+
+ t.Run(test.name, func(t *testing.T) {
+ t.Parallel()
+
+ ndpDisp := ndpDispatcher{
+ autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+ }
+ e := channel.New(10, 1280, linkAddr1)
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ HandleRAs: true,
+ AutoGenGlobalAddresses: true,
+ },
+ NDPDisp: &ndpDisp,
+ })
+
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(1) = %s", err)
+ }
+
+ // Receive an RA with prefix with initial VL,
+ // test.ovl.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.ovl, 0))
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr, newAddr); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+ default:
+ t.Fatal("expected addr auto gen event")
+ }
+
+				// Receive a new RA with the prefix with the new VL,
+ // test.nvl.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, test.nvl, 0))
+
+ //
+ // Validate that the VL for the address got set
+ // to test.evl.
+ //
+
+ // Make sure we do not get any invalidation
+				// events until at least 500ms (delta) before
+ // test.evl.
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly received an auto gen addr event")
+ case <-time.After(time.Duration(test.evl)*time.Second - delta):
+ }
+
+ // Wait for another second (2x delta), but now
+ // we expect the invalidation event.
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+
+ case <-time.After(2 * delta):
+ t.Fatal("timeout waiting for addr auto gen event")
+ }
+ })
+ }
+ })
+}
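The test table above encodes the valid lifetime update rule of RFC 4862 section 5.5.3.e. A minimal sketch of that decision rule as the cases exercise it, where minVL stands in for stack.MinPrefixInformationValidLifetimeForUpdate (2 hours in the RFC, shortened to newMinVL seconds in this test); this only illustrates the rule and is not the stack's implementation:

    func updatedValidLifetime(remaining, received, minVL time.Duration) time.Duration {
        switch {
        case received > minVL || received > remaining:
            // Honor the received valid lifetime.
            return received
        case remaining <= minVL:
            // Both lifetimes are small: ignore the received value and keep
            // the remaining lifetime.
            return remaining
        default:
            // The remaining lifetime is large but the received one is small:
            // clamp to the minimum value allowed for an update.
            return minVL
        }
    }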
+
+// TestAutoGenAddrRemoval tests that when an auto-generated address is removed
+// by the user, its resources are cleaned up and an invalidation event is sent
+// to the integrator.
+func TestAutoGenAddrRemoval(t *testing.T) {
+ t.Parallel()
+
+ prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+ ndpDisp := ndpDispatcher{
+ autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+ }
+ e := channel.New(0, 1280, linkAddr1)
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ HandleRAs: true,
+ AutoGenGlobalAddresses: true,
+ },
+ NDPDisp: &ndpDisp,
+ })
+
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(1) = %s", err)
+ }
+
+ expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+ t.Helper()
+
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+ default:
+ t.Fatal("expected addr auto gen event")
+ }
+ }
+
+ // Receive a PI to auto-generate an address.
+ const lifetimeSeconds = 1
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, 0))
+ expectAutoGenAddrEvent(addr, newAddr)
+
+ // Removing the address should result in an invalidation event
+ // immediately.
+ if err := s.RemoveAddress(1, addr.Address); err != nil {
+ t.Fatalf("RemoveAddress(_, %s) = %s", addr.Address, err)
+ }
+ expectAutoGenAddrEvent(addr, invalidatedAddr)
+
+ // Wait for the original valid lifetime to make sure the original timer
+ // got stopped/cleaned up.
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly received an auto gen addr event")
+ case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
+ }
+}
+
+// TestAutoGenAddrAfterRemoval tests adding a SLAAC address that was previously
+// assigned to the NIC but is in the permanentExpired state.
+func TestAutoGenAddrAfterRemoval(t *testing.T) {
+ t.Parallel()
+
+ const nicID = 1
+
+ prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+ prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+ ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+
+ expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+ t.Helper()
+
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+ default:
+ t.Fatal("expected addr auto gen event")
+ }
+ }
+
+ expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+ t.Helper()
+
+ if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+ t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+ } else if got != addr {
+ t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+ }
+
+ if got := addrForNewConnection(t, s); got != addr.Address {
+ t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+ }
+ }
+
+ // Receive a PI to auto-generate addr1 with a large valid and preferred
+ // lifetime.
+ const largeLifetimeSeconds = 999
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix1, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+ expectAutoGenAddrEvent(addr1, newAddr)
+ expectPrimaryAddr(addr1)
+
+ // Add addr2 as a static address.
+ protoAddr2 := tcpip.ProtocolAddress{
+ Protocol: header.IPv6ProtocolNumber,
+ AddressWithPrefix: addr2,
+ }
+ if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil {
+		t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err)
+ }
+ // addr2 should be more preferred now since it is at the front of the primary
+ // list.
+ expectPrimaryAddr(addr2)
+
+ // Get a route using addr2 to increment its reference count then remove it
+ // to leave it in the permanentExpired state.
+ r, err := s.FindRoute(nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, false)
+ if err != nil {
+ t.Fatalf("FindRoute(%d, %s, %s, %d, false): %s", nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, err)
+ }
+ defer r.Release()
+ if err := s.RemoveAddress(nicID, addr2.Address); err != nil {
+ t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, addr2.Address, err)
+ }
+ // addr1 should be preferred again since addr2 is in the expired state.
+ expectPrimaryAddr(addr1)
+
+ // Receive a PI to auto-generate addr2 as valid and preferred.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+ expectAutoGenAddrEvent(addr2, newAddr)
+ // addr2 should be more preferred now that it is closer to the front of the
+ // primary list and not deprecated.
+ expectPrimaryAddr(addr2)
+
+ // Removing the address should result in an invalidation event immediately.
+ // It should still be in the permanentExpired state because r is still held.
+ //
+ // We remove addr2 here to make sure addr2 was marked as a SLAAC address
+ // (it was previously marked as a static address).
+ if err := s.RemoveAddress(1, addr2.Address); err != nil {
+ t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err)
+ }
+ expectAutoGenAddrEvent(addr2, invalidatedAddr)
+ // addr1 should be more preferred since addr2 is in the expired state.
+ expectPrimaryAddr(addr1)
+
+ // Receive a PI to auto-generate addr2 as valid and deprecated.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, 0))
+ expectAutoGenAddrEvent(addr2, newAddr)
+ // addr1 should still be more preferred since addr2 is deprecated, even though
+ // it is closer to the front of the primary list.
+ expectPrimaryAddr(addr1)
+
+ // Receive a PI to refresh addr2's preferred lifetime.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly got an auto gen addr event")
+ default:
+ }
+ // addr2 should be more preferred now that it is not deprecated.
+ expectPrimaryAddr(addr2)
+
+ if err := s.RemoveAddress(1, addr2.Address); err != nil {
+ t.Fatalf("RemoveAddress(_, %s) = %s", addr2.Address, err)
+ }
+ expectAutoGenAddrEvent(addr2, invalidatedAddr)
+ expectPrimaryAddr(addr1)
+}
+
+// TestAutoGenAddrStaticConflict tests that if SLAAC generates an address that
+// is already assigned to the NIC, the static address remains.
+func TestAutoGenAddrStaticConflict(t *testing.T) {
+ t.Parallel()
+
+ prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+ ndpDisp := ndpDispatcher{
+ autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+ }
+ e := channel.New(0, 1280, linkAddr1)
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ HandleRAs: true,
+ AutoGenGlobalAddresses: true,
+ },
+ NDPDisp: &ndpDisp,
+ })
+
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(1) = %s", err)
+ }
+
+ // Add the address as a static address before SLAAC tries to add it.
+ if err := s.AddProtocolAddress(1, tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: addr}); err != nil {
+		t.Fatalf("AddProtocolAddress(_, {%d, %s}) = %s", header.IPv6ProtocolNumber, addr.Address, err)
+ }
+ if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+		t.Fatalf("Should have %s in the list of addresses", addr)
+ }
+
+ // Receive a PI where the generated address will be the same as the one
+ // that we already have assigned statically.
+ const lifetimeSeconds = 1
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, 0))
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly received an auto gen addr event for an address we already have statically")
+ default:
+ }
+ if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+		t.Fatalf("Should have %s in the list of addresses", addr)
+ }
+
+ // Should not get an invalidation event after the PI's invalidation
+ // time.
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Fatal("unexpectedly received an auto gen addr event")
+ case <-time.After(lifetimeSeconds*time.Second + defaultTimeout):
+ }
+ if !contains(s.NICInfo()[1].ProtocolAddresses, addr) {
+		t.Fatalf("Should have %s in the list of addresses", addr)
+ }
+}
+
+// TestAutoGenAddrWithOpaqueIID tests that SLAAC-generated addresses will use
+// opaque interface identifiers when configured to do so.
+func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
+ t.Parallel()
+
+ const nicID = 1
+ const nicName = "nic1"
+ var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte
+ secretKey := secretKeyBuf[:]
+ n, err := rand.Read(secretKey)
+ if err != nil {
+ t.Fatalf("rand.Read(_): %s", err)
+ }
+ if n != header.OpaqueIIDSecretKeyMinBytes {
+ t.Fatalf("got rand.Read(_) = (%d, _), want = (%d, _)", n, header.OpaqueIIDSecretKeyMinBytes)
+ }
+
+ prefix1, subnet1, _ := prefixSubnetAddr(0, linkAddr1)
+ prefix2, subnet2, _ := prefixSubnetAddr(1, linkAddr1)
+ // addr1 and addr2 are the addresses that are expected to be generated when
+ // stack.Stack is configured to generate opaque interface identifiers as
+ // defined by RFC 7217.
+ addrBytes := []byte(subnet1.ID())
+ addr1 := tcpip.AddressWithPrefix{
+ Address: tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet1, nicName, 0, secretKey)),
+ PrefixLen: 64,
+ }
+ addrBytes = []byte(subnet2.ID())
+ addr2 := tcpip.AddressWithPrefix{
+ Address: tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet2, nicName, 0, secretKey)),
+ PrefixLen: 64,
+ }
+
+ ndpDisp := ndpDispatcher{
+ autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+ }
+ e := channel.New(0, 1280, linkAddr1)
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ HandleRAs: true,
+ AutoGenGlobalAddresses: true,
+ },
+ NDPDisp: &ndpDisp,
+ OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+ NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+ return nicName
+ },
+ SecretKey: secretKey,
+ },
+ })
+ opts := stack.NICOptions{Name: nicName}
+ if err := s.CreateNICWithOptions(nicID, e, opts); err != nil {
+ t.Fatalf("CreateNICWithOptions(%d, _, %+v, _) = %s", nicID, opts, err)
+ }
+
+ expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+ t.Helper()
+
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+ default:
+ t.Fatal("expected addr auto gen event")
+ }
+ }
+
+ // Receive an RA with prefix1 in a PI.
+ const validLifetimeSecondPrefix1 = 1
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, validLifetimeSecondPrefix1, 0))
+ expectAutoGenAddrEvent(addr1, newAddr)
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+ t.Fatalf("should have %s in the list of addresses", addr1)
+ }
+
+ // Receive an RA with prefix2 in a PI with a large valid lifetime.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+ expectAutoGenAddrEvent(addr2, newAddr)
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+ t.Fatalf("should have %s in the list of addresses", addr1)
+ }
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+ t.Fatalf("should have %s in the list of addresses", addr2)
+ }
+
+ // Wait for addr of prefix1 to be invalidated.
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" {
+ t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+ }
+ case <-time.After(validLifetimeSecondPrefix1*time.Second + defaultTimeout):
+ t.Fatal("timed out waiting for addr auto gen event")
+ }
+ if contains(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+ t.Fatalf("should not have %s in the list of addresses", addr1)
+ }
+ if !contains(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+ t.Fatalf("should have %s in the list of addresses", addr2)
+ }
+}
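The opaque interface identifiers above come from header.AppendOpaqueInterfaceIdentifier, which derives the IID from the prefix, the NIC name, a DAD counter and the secret key. As a rough, hypothetical illustration of the RFC 7217 idea only (not the helper's actual construction), an opaque IID is a cryptographic hash over those inputs truncated to the 64-bit interface identifier:

    // Illustration only; requires "crypto/sha256".
    func opaqueIIDSketch(prefix []byte, nicName string, dadCounter uint8, secret []byte) [8]byte {
        var in []byte
        in = append(in, prefix...)  // the 8-byte /64 prefix
        in = append(in, nicName...) // interface name
        in = append(in, dadCounter) // DAD/regeneration counter
        in = append(in, secret...)  // per-host secret key
        sum := sha256.Sum256(in)
        var iid [8]byte
        copy(iid[:], sum[:]) // truncate to the 64-bit interface identifier
        return iid
    }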
+
+// TestNDPRecursiveDNSServerDispatch tests that we properly dispatch an event
+// to the integrator when an RA is received with the NDP Recursive DNS Server
+// option with at least one valid address.
+func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
+ t.Parallel()
+
+ tests := []struct {
+ name string
+ opt header.NDPRecursiveDNSServer
+ expected *ndpRDNSS
+ }{
+ {
+ "Unspecified",
+ header.NDPRecursiveDNSServer([]byte{
+ 0, 0,
+ 0, 0, 0, 2,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ }),
+ nil,
+ },
+ {
+ "Multicast",
+ header.NDPRecursiveDNSServer([]byte{
+ 0, 0,
+ 0, 0, 0, 2,
+ 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ }),
+ nil,
+ },
+ {
+ "OptionTooSmall",
+ header.NDPRecursiveDNSServer([]byte{
+ 0, 0,
+ 0, 0, 0, 2,
+ 1, 2, 3, 4, 5, 6, 7, 8,
+ }),
+ nil,
+ },
+ {
+ "0Addresses",
+ header.NDPRecursiveDNSServer([]byte{
+ 0, 0,
+ 0, 0, 0, 2,
+ }),
+ nil,
+ },
+ {
+ "Valid1Address",
+ header.NDPRecursiveDNSServer([]byte{
+ 0, 0,
+ 0, 0, 0, 2,
+ 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1,
+ }),
+ &ndpRDNSS{
+ []tcpip.Address{
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01",
+ },
+ 2 * time.Second,
+ },
+ },
+ {
+ "Valid2Addresses",
+ header.NDPRecursiveDNSServer([]byte{
+ 0, 0,
+ 0, 0, 0, 1,
+ 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1,
+ 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 2,
+ }),
+ &ndpRDNSS{
+ []tcpip.Address{
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01",
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x02",
+ },
+ time.Second,
+ },
+ },
+ {
+ "Valid3Addresses",
+ header.NDPRecursiveDNSServer([]byte{
+ 0, 0,
+ 0, 0, 0, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1,
+ 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 2,
+ 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 3,
+ }),
+ &ndpRDNSS{
+ []tcpip.Address{
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01",
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x02",
+ "\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x03",
+ },
+ 0,
+ },
+ },
+ }
+
+ for _, test := range tests {
+ test := test
+
+ t.Run(test.name, func(t *testing.T) {
+ t.Parallel()
+
+ ndpDisp := ndpDispatcher{
+ // We do not expect more than a single RDNSS
+ // event at any time for this test.
+ rdnssC: make(chan ndpRDNSSEvent, 1),
+ }
+ e := channel.New(0, 1280, linkAddr1)
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ HandleRAs: true,
+ },
+ NDPDisp: &ndpDisp,
+ })
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(1) = %s", err)
+ }
+
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, header.NDPOptionsSerializer{test.opt}))
+
+ if test.expected != nil {
+ select {
+ case e := <-ndpDisp.rdnssC:
+ if e.nicID != 1 {
+ t.Errorf("got rdnss nicID = %d, want = 1", e.nicID)
+ }
+					if diff := cmp.Diff(test.expected.addrs, e.rdnss.addrs); diff != "" {
+ t.Errorf("rdnss addrs mismatch (-want +got):\n%s", diff)
+ }
+ if e.rdnss.lifetime != test.expected.lifetime {
+ t.Errorf("got rdnss lifetime = %s, want = %s", e.rdnss.lifetime, test.expected.lifetime)
+ }
+ default:
+ t.Fatal("expected an RDNSS option event")
+ }
+ }
+
+ // Should have no more RDNSS options.
+ select {
+ case e := <-ndpDisp.rdnssC:
+ t.Fatalf("unexpectedly got a new RDNSS option event: %+v", e)
+ default:
+ }
+ })
+ }
+}
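The raw option bodies in the test vectors above follow the RDNSS layout expected by header.NDPRecursiveDNSServer: two reserved bytes, a 4-byte big-endian lifetime in seconds, then zero or more 16-byte IPv6 addresses (which is why the "Valid1Address" case reports a 2 second lifetime). A small hypothetical helper that builds such a body, shown only to document the layout:

    // Illustration only; requires "encoding/binary".
    func rdnssBody(lifetimeSeconds uint32, addrs ...[16]byte) []byte {
        b := make([]byte, 6, 6+16*len(addrs))
        // b[0:2] is reserved and left as zero.
        binary.BigEndian.PutUint32(b[2:], lifetimeSeconds)
        for _, a := range addrs {
            b = append(b, a[:]...)
        }
        return b
    }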
+
+// TestCleanupHostOnlyStateOnBecomingRouter tests that all discovered routers
+// and prefixes, and auto-generated addresses get invalidated when a NIC
+// becomes a router.
+func TestCleanupHostOnlyStateOnBecomingRouter(t *testing.T) {
+ t.Parallel()
+
+ const (
+ lifetimeSeconds = 5
+ maxEvents = 4
+ nicID1 = 1
+ nicID2 = 2
+ )
+
+ prefix1, subnet1, e1Addr1 := prefixSubnetAddr(0, linkAddr1)
+ prefix2, subnet2, e1Addr2 := prefixSubnetAddr(1, linkAddr1)
+ e2Addr1 := addrForSubnet(subnet1, linkAddr2)
+ e2Addr2 := addrForSubnet(subnet2, linkAddr2)
+
+ ndpDisp := ndpDispatcher{
+ routerC: make(chan ndpRouterEvent, maxEvents),
+ rememberRouter: true,
+ prefixC: make(chan ndpPrefixEvent, maxEvents),
+ rememberPrefix: true,
+ autoGenAddrC: make(chan ndpAutoGenAddrEvent, maxEvents),
+ }
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ HandleRAs: true,
+ DiscoverDefaultRouters: true,
+ DiscoverOnLinkPrefixes: true,
+ AutoGenGlobalAddresses: true,
+ },
+ NDPDisp: &ndpDisp,
+ })
+
+ e1 := channel.New(0, 1280, linkAddr1)
+ if err := s.CreateNIC(nicID1, e1); err != nil {
+ t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
+ }
+
+ e2 := channel.New(0, 1280, linkAddr2)
+ if err := s.CreateNIC(nicID2, e2); err != nil {
+ t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
+ }
+
+ expectRouterEvent := func() (bool, ndpRouterEvent) {
+ select {
+ case e := <-ndpDisp.routerC:
+ return true, e
+ default:
+ }
+
+ return false, ndpRouterEvent{}
+ }
+
+ expectPrefixEvent := func() (bool, ndpPrefixEvent) {
+ select {
+ case e := <-ndpDisp.prefixC:
+ return true, e
+ default:
+ }
+
+ return false, ndpPrefixEvent{}
+ }
+
+ expectAutoGenAddrEvent := func() (bool, ndpAutoGenAddrEvent) {
+ select {
+ case e := <-ndpDisp.autoGenAddrC:
+ return true, e
+ default:
+ }
+
+ return false, ndpAutoGenAddrEvent{}
+ }
+
+ // Receive RAs on NIC(1) and NIC(2) from default routers (llAddr1 and
+ // llAddr2) w/ PI (for prefix1 in RA from llAddr1 and prefix2 in RA from
+ // llAddr2) to discover multiple routers and prefixes, and auto-gen
+ // multiple addresses.
+
+ e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr1, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+ // We have other tests that make sure we receive the *correct* events
+ // on normal discovery of routers/prefixes, and auto-generated
+ // addresses. Here we just make sure we get an event and let other tests
+ // handle the correctness check.
+ if ok, _ := expectRouterEvent(); !ok {
+ t.Errorf("expected router event for %s on NIC(%d)", llAddr1, nicID1)
+ }
+ if ok, _ := expectPrefixEvent(); !ok {
+ t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1)
+ }
+ if ok, _ := expectAutoGenAddrEvent(); !ok {
+ t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1)
+ }
+
+ e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+ if ok, _ := expectRouterEvent(); !ok {
+ t.Errorf("expected router event for %s on NIC(%d)", llAddr2, nicID1)
+ }
+ if ok, _ := expectPrefixEvent(); !ok {
+ t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1)
+ }
+ if ok, _ := expectAutoGenAddrEvent(); !ok {
+ t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1)
+ }
+
+ e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr1, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+ if ok, _ := expectRouterEvent(); !ok {
+ t.Errorf("expected router event for %s on NIC(%d)", llAddr1, nicID2)
+ }
+ if ok, _ := expectPrefixEvent(); !ok {
+ t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2)
+ }
+ if ok, _ := expectAutoGenAddrEvent(); !ok {
+		t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e2Addr1, nicID2)
+ }
+
+ e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+ if ok, _ := expectRouterEvent(); !ok {
+ t.Errorf("expected router event for %s on NIC(%d)", llAddr2, nicID2)
+ }
+ if ok, _ := expectPrefixEvent(); !ok {
+ t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2)
+ }
+ if ok, _ := expectAutoGenAddrEvent(); !ok {
+ t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e2Addr2, nicID2)
+ }
+
+ // We should have the auto-generated addresses added.
+ nicinfo := s.NICInfo()
+ nic1Addrs := nicinfo[nicID1].ProtocolAddresses
+ nic2Addrs := nicinfo[nicID2].ProtocolAddresses
+ if !contains(nic1Addrs, e1Addr1) {
+ t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+ }
+ if !contains(nic1Addrs, e1Addr2) {
+ t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+ }
+ if !contains(nic2Addrs, e2Addr1) {
+ t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+ }
+ if !contains(nic2Addrs, e2Addr2) {
+ t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
+ }
+
+ // We can't proceed any further if we already failed the test (missing
+ // some discovery/auto-generated address events or addresses).
+ if t.Failed() {
+ t.FailNow()
+ }
+
+ s.SetForwarding(true)
+
+ // Collect invalidation events after becoming a router
+ gotRouterEvents := make(map[ndpRouterEvent]int)
+ for i := 0; i < maxEvents; i++ {
+ ok, e := expectRouterEvent()
+ if !ok {
+ t.Errorf("expected %d router events after becoming a router; got = %d", maxEvents, i)
+ break
+ }
+ gotRouterEvents[e]++
+ }
+ gotPrefixEvents := make(map[ndpPrefixEvent]int)
+ for i := 0; i < maxEvents; i++ {
+ ok, e := expectPrefixEvent()
+ if !ok {
+ t.Errorf("expected %d prefix events after becoming a router; got = %d", maxEvents, i)
+ break
+ }
+ gotPrefixEvents[e]++
+ }
+ gotAutoGenAddrEvents := make(map[ndpAutoGenAddrEvent]int)
+ for i := 0; i < maxEvents; i++ {
+ ok, e := expectAutoGenAddrEvent()
+ if !ok {
+ t.Errorf("expected %d auto-generated address events after becoming a router; got = %d", maxEvents, i)
+ break
+ }
+ gotAutoGenAddrEvents[e]++
+ }
+
+ // No need to proceed any further if we already failed the test (missing
+ // some invalidation events).
+ if t.Failed() {
+ t.FailNow()
+ }
+
+ expectedRouterEvents := map[ndpRouterEvent]int{
+ {nicID: nicID1, addr: llAddr1, discovered: false}: 1,
+ {nicID: nicID1, addr: llAddr2, discovered: false}: 1,
+ {nicID: nicID2, addr: llAddr1, discovered: false}: 1,
+ {nicID: nicID2, addr: llAddr2, discovered: false}: 1,
+ }
+ if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" {
+ t.Errorf("router events mismatch (-want +got):\n%s", diff)
+ }
+ expectedPrefixEvents := map[ndpPrefixEvent]int{
+ {nicID: nicID1, prefix: subnet1, discovered: false}: 1,
+ {nicID: nicID1, prefix: subnet2, discovered: false}: 1,
+ {nicID: nicID2, prefix: subnet1, discovered: false}: 1,
+ {nicID: nicID2, prefix: subnet2, discovered: false}: 1,
+ }
+ if diff := cmp.Diff(expectedPrefixEvents, gotPrefixEvents); diff != "" {
+ t.Errorf("prefix events mismatch (-want +got):\n%s", diff)
+ }
+ expectedAutoGenAddrEvents := map[ndpAutoGenAddrEvent]int{
+ {nicID: nicID1, addr: e1Addr1, eventType: invalidatedAddr}: 1,
+ {nicID: nicID1, addr: e1Addr2, eventType: invalidatedAddr}: 1,
+ {nicID: nicID2, addr: e2Addr1, eventType: invalidatedAddr}: 1,
+ {nicID: nicID2, addr: e2Addr2, eventType: invalidatedAddr}: 1,
+ }
+ if diff := cmp.Diff(expectedAutoGenAddrEvents, gotAutoGenAddrEvents); diff != "" {
+ t.Errorf("auto-generated address events mismatch (-want +got):\n%s", diff)
+ }
+
+ // Make sure the auto-generated addresses got removed.
+ nicinfo = s.NICInfo()
+ nic1Addrs = nicinfo[nicID1].ProtocolAddresses
+ nic2Addrs = nicinfo[nicID2].ProtocolAddresses
+ if contains(nic1Addrs, e1Addr1) {
+ t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+ }
+ if contains(nic1Addrs, e1Addr2) {
+ t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+ }
+ if contains(nic2Addrs, e2Addr1) {
+ t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+ }
+ if contains(nic2Addrs, e2Addr2) {
+ t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
+ }
+
+ // Should not get any more events (invalidation timers should have been
+ // cancelled when we transitioned into a router).
+ time.Sleep(lifetimeSeconds*time.Second + defaultTimeout)
+ select {
+ case <-ndpDisp.routerC:
+ t.Error("unexpected router event")
+ default:
+ }
+ select {
+ case <-ndpDisp.prefixC:
+ t.Error("unexpected prefix event")
+ default:
+ }
+ select {
+ case <-ndpDisp.autoGenAddrC:
+ t.Error("unexpected auto-generated address event")
+ default:
+ }
+}
+
+// TestDHCPv6ConfigurationFromNDPDA tests that the NDPDispatcher is properly
+// informed when new information about what configurations are available via
+// DHCPv6 is learned.
+func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
+ const nicID = 1
+
+ ndpDisp := ndpDispatcher{
+ dhcpv6ConfigurationC: make(chan ndpDHCPv6Event, 1),
+ rememberRouter: true,
+ }
+ e := channel.New(0, 1280, linkAddr1)
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ HandleRAs: true,
+ },
+ NDPDisp: &ndpDisp,
+ })
+
+ if err := s.CreateNIC(nicID, e); err != nil {
+ t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+ }
+
+ expectDHCPv6Event := func(configuration stack.DHCPv6ConfigurationFromNDPRA) {
+ t.Helper()
+ select {
+ case e := <-ndpDisp.dhcpv6ConfigurationC:
+ if diff := cmp.Diff(ndpDHCPv6Event{nicID: nicID, configuration: configuration}, e, cmp.AllowUnexported(e)); diff != "" {
+ t.Errorf("dhcpv6 event mismatch (-want +got):\n%s", diff)
+ }
+ default:
+ t.Fatal("expected DHCPv6 configuration event")
+ }
+ }
+
+ expectNoDHCPv6Event := func() {
+ t.Helper()
+ select {
+ case <-ndpDisp.dhcpv6ConfigurationC:
+ t.Fatal("unexpected DHCPv6 configuration event")
+ default:
+ }
+ }
+
+ // The initial DHCPv6 configuration should be stack.DHCPv6NoConfiguration.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false))
+ expectNoDHCPv6Event()
+
+ // Receive an RA that updates the DHCPv6 configuration to Other
+ // Configurations.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
+ expectDHCPv6Event(stack.DHCPv6OtherConfigurations)
+ // Receiving the same update again should not result in an event to the
+ // NDPDispatcher.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
+ expectNoDHCPv6Event()
+
+ // Receive an RA that updates the DHCPv6 configuration to Managed Address.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, false))
+ expectDHCPv6Event(stack.DHCPv6ManagedAddress)
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, false))
+ expectNoDHCPv6Event()
+
+ // Receive an RA that updates the DHCPv6 configuration to none.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false))
+ expectDHCPv6Event(stack.DHCPv6NoConfiguration)
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, false))
+ expectNoDHCPv6Event()
+
+ // Receive an RA that updates the DHCPv6 configuration to Managed Address.
+ //
+ // Note, when the M flag is set, the O flag is redundant.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, true))
+ expectDHCPv6Event(stack.DHCPv6ManagedAddress)
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, true))
+ expectNoDHCPv6Event()
+ // Even though the DHCPv6 flags are different, the effective configuration is
+ // the same so we should not receive a new event.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, false))
+ expectNoDHCPv6Event()
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, true, true))
+ expectNoDHCPv6Event()
+
+ // Receive an RA that updates the DHCPv6 configuration to Other
+ // Configurations.
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
+ expectDHCPv6Event(stack.DHCPv6OtherConfigurations)
+ e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, false, true))
+ expectNoDHCPv6Event()
+}
+
+// TestRouterSolicitation tests the initial Router Solicitations that are sent
+// when a NIC newly becomes enabled.
+func TestRouterSolicitation(t *testing.T) {
+ t.Parallel()
+
+ tests := []struct {
+ name string
+ maxRtrSolicit uint8
+ rtrSolicitInt time.Duration
+ effectiveRtrSolicitInt time.Duration
+ maxRtrSolicitDelay time.Duration
+ effectiveMaxRtrSolicitDelay time.Duration
+ }{
+ {
+ name: "Single RS with delay",
+ maxRtrSolicit: 1,
+ rtrSolicitInt: time.Second,
+ effectiveRtrSolicitInt: time.Second,
+ maxRtrSolicitDelay: time.Second,
+ effectiveMaxRtrSolicitDelay: time.Second,
+ },
+ {
+ name: "Two RS with delay",
+ maxRtrSolicit: 2,
+ rtrSolicitInt: time.Second,
+ effectiveRtrSolicitInt: time.Second,
+ maxRtrSolicitDelay: 500 * time.Millisecond,
+ effectiveMaxRtrSolicitDelay: 500 * time.Millisecond,
+ },
+ {
+ name: "Single RS without delay",
+ maxRtrSolicit: 1,
+ rtrSolicitInt: time.Second,
+ effectiveRtrSolicitInt: time.Second,
+ maxRtrSolicitDelay: 0,
+ effectiveMaxRtrSolicitDelay: 0,
+ },
+ {
+ name: "Two RS without delay and invalid zero interval",
+ maxRtrSolicit: 2,
+ rtrSolicitInt: 0,
+ effectiveRtrSolicitInt: 4 * time.Second,
+ maxRtrSolicitDelay: 0,
+ effectiveMaxRtrSolicitDelay: 0,
+ },
+ {
+ name: "Three RS without delay",
+ maxRtrSolicit: 3,
+ rtrSolicitInt: 500 * time.Millisecond,
+ effectiveRtrSolicitInt: 500 * time.Millisecond,
+ maxRtrSolicitDelay: 0,
+ effectiveMaxRtrSolicitDelay: 0,
+ },
+ {
+ name: "Two RS with invalid negative delay",
+ maxRtrSolicit: 2,
+ rtrSolicitInt: time.Second,
+ effectiveRtrSolicitInt: time.Second,
+ maxRtrSolicitDelay: -3 * time.Second,
+ effectiveMaxRtrSolicitDelay: time.Second,
+ },
+ }
+
+ // This Run will not return until the parallel tests finish.
+ //
+ // We need this because we need to do some teardown work after the
+ // parallel tests complete.
+ //
+ // See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+ // more details.
+ t.Run("group", func(t *testing.T) {
+ for _, test := range tests {
+ test := test
+
+ t.Run(test.name, func(t *testing.T) {
+ t.Parallel()
+ e := channel.New(int(test.maxRtrSolicit), 1280, linkAddr1)
+ waitForPkt := func(timeout time.Duration) {
+ t.Helper()
+ select {
+ case p := <-e.C:
+ if p.Proto != header.IPv6ProtocolNumber {
+ t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+ }
+ checker.IPv6(t,
+ p.Pkt.Header.View(),
+ checker.SrcAddr(header.IPv6Any),
+ checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+ checker.TTL(header.NDPHopLimit),
+ checker.NDPRS(),
+ )
+
+ case <-time.After(timeout):
+ t.Fatal("timed out waiting for packet")
+ }
+ }
+ waitForNothing := func(timeout time.Duration) {
+ t.Helper()
+ select {
+ case <-e.C:
+ t.Fatal("unexpectedly got a packet")
+ case <-time.After(timeout):
+ }
+ }
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ MaxRtrSolicitations: test.maxRtrSolicit,
+ RtrSolicitationInterval: test.rtrSolicitInt,
+ MaxRtrSolicitationDelay: test.maxRtrSolicitDelay,
+ },
+ })
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(1) = %s", err)
+ }
+
+ // Make sure each RS got sent at the right
+ // times.
+ remaining := test.maxRtrSolicit
+ if remaining > 0 {
+ waitForPkt(test.effectiveMaxRtrSolicitDelay + defaultTimeout)
+ remaining--
+ }
+ for ; remaining > 0; remaining-- {
+ waitForNothing(test.effectiveRtrSolicitInt - defaultTimeout)
+ waitForPkt(2 * defaultTimeout)
+ }
+
+ // Make sure no more RS.
+ if test.effectiveRtrSolicitInt > test.effectiveMaxRtrSolicitDelay {
+ waitForNothing(test.effectiveRtrSolicitInt + defaultTimeout)
+ } else {
+ waitForNothing(test.effectiveMaxRtrSolicitDelay + defaultTimeout)
+ }
+
+ // Make sure the counter got properly
+ // incremented.
+ if got, want := s.Stats().ICMP.V6PacketsSent.RouterSolicit.Value(), uint64(test.maxRtrSolicit); got != want {
+ t.Fatalf("got sent RouterSolicit = %d, want = %d", got, want)
+ }
+ })
+ }
+ })
+}
+
+// TestStopStartSolicitingRouters tests that when forwarding is enabled or
+// disabled, router solicitations are stopped or started, respectively.
+func TestStopStartSolicitingRouters(t *testing.T) {
+ t.Parallel()
+
+ const interval = 500 * time.Millisecond
+ const delay = time.Second
+ const maxRtrSolicitations = 3
+ e := channel.New(maxRtrSolicitations, 1280, linkAddr1)
+ waitForPkt := func(timeout time.Duration) {
+ t.Helper()
+ select {
+ case p := <-e.C:
+ if p.Proto != header.IPv6ProtocolNumber {
+ t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+ }
+ checker.IPv6(t, p.Pkt.Header.View(),
+ checker.SrcAddr(header.IPv6Any),
+ checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+ checker.TTL(header.NDPHopLimit),
+ checker.NDPRS())
+
+ case <-time.After(timeout):
+ t.Fatal("timed out waiting for packet")
+ }
+ }
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ NDPConfigs: stack.NDPConfigurations{
+ MaxRtrSolicitations: maxRtrSolicitations,
+ RtrSolicitationInterval: interval,
+ MaxRtrSolicitationDelay: delay,
+ },
+ })
+ if err := s.CreateNIC(1, e); err != nil {
+ t.Fatalf("CreateNIC(1) = %s", err)
+ }
+
+ // Enable forwarding which should stop router solicitations.
+ s.SetForwarding(true)
+ select {
+ case <-e.C:
+ // A single RS may have been sent before forwarding was enabled.
+ select {
+ case <-e.C:
+ t.Fatal("Should not have sent more than one RS message")
+ case <-time.After(interval + defaultTimeout):
+ }
+ case <-time.After(delay + defaultTimeout):
+ }
+
+ // Enabling forwarding again should do nothing.
+ s.SetForwarding(true)
+ select {
+ case <-e.C:
+ t.Fatal("unexpectedly got a packet after becoming a router")
+ case <-time.After(delay + defaultTimeout):
+ }
+
+ // Disable forwarding which should start router solicitations.
+ s.SetForwarding(false)
+ waitForPkt(delay + defaultTimeout)
+ waitForPkt(interval + defaultTimeout)
+ waitForPkt(interval + defaultTimeout)
+ select {
+ case <-e.C:
+ t.Fatal("unexpectedly got an extra packet after sending out the expected RSs")
+ case <-time.After(interval + defaultTimeout):
+ }
+
+ // Disabling forwarding again should do nothing.
+ s.SetForwarding(false)
+ select {
+ case <-e.C:
+ t.Fatal("unexpectedly got a packet after becoming a router")
+ case <-time.After(delay + defaultTimeout):
}
}
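
The router solicitation tests above drive the stack purely through stack.Options and stack.NDPConfigurations. As a minimal, standalone sketch of that configuration surface (not part of this change; the MAC address and NIC ID below are arbitrary examples), assuming the gvisor import paths used throughout this diff:

package main

import (
	"time"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

func main() {
	// A channel endpoint queues outgoing packets so the caller can observe
	// the Router Solicitations the stack emits after the NIC is enabled.
	linkAddr := tcpip.LinkAddress("\x02\x03\x04\x05\x06\x07") // arbitrary example MAC
	e := channel.New(3 /* queue size */, 1280 /* MTU */, linkAddr)

	s := stack.New(stack.Options{
		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
		NDPConfigs: stack.NDPConfigurations{
			// Send up to three solicitations, one second apart, after an
			// initial delay of at most one second.
			MaxRtrSolicitations:     3,
			RtrSolicitationInterval: time.Second,
			MaxRtrSolicitationDelay: time.Second,
		},
	})

	// Creating the NIC enables it, which kicks off router solicitation
	// (unless the stack is configured as a forwarding router).
	if err := s.CreateNIC(1, e); err != nil {
		panic(err)
	}
}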
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
index 3f8d7312c..4452a1302 100644
--- a/pkg/tcpip/stack/nic.go
+++ b/pkg/tcpip/stack/nic.go
@@ -15,10 +15,12 @@
package stack
import (
+ "log"
+ "sort"
"strings"
- "sync"
"sync/atomic"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -27,11 +29,11 @@ import (
// NIC represents a "network interface card" to which the networking stack is
// attached.
type NIC struct {
- stack *Stack
- id tcpip.NICID
- name string
- linkEP LinkEndpoint
- loopback bool
+ stack *Stack
+ id tcpip.NICID
+ name string
+ linkEP LinkEndpoint
+ context NICContext
mu sync.RWMutex
spoofing bool
@@ -85,7 +87,7 @@ const (
)
// newNIC returns a new NIC using the default NDP configurations from stack.
-func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback bool) *NIC {
+func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICContext) *NIC {
// TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For
// example, make sure that the link address it provides is a valid
// unicast ethernet address.
@@ -99,7 +101,7 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
id: id,
name: name,
linkEP: ep,
- loopback: loopback,
+ context: ctx,
primary: make(map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint),
endpoints: make(map[NetworkEndpointID]*referencedNetworkEndpoint),
mcastJoins: make(map[NetworkEndpointID]int32),
@@ -115,10 +117,11 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback
},
},
ndp: ndpState{
- configs: stack.ndpConfigs,
- dad: make(map[tcpip.Address]dadState),
- defaultRouters: make(map[tcpip.Address]defaultRouterState),
- onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState),
+ configs: stack.ndpConfigs,
+ dad: make(map[tcpip.Address]dadState),
+ defaultRouters: make(map[tcpip.Address]defaultRouterState),
+ onLinkPrefixes: make(map[tcpip.Subnet]onLinkPrefixState),
+ autoGenAddresses: make(map[tcpip.Address]autoGenAddressState),
},
}
nic.ndp.nic = nic
@@ -173,33 +176,73 @@ func (n *NIC) enable() *tcpip.Error {
return err
}
- if !n.stack.autoGenIPv6LinkLocal {
- return nil
- }
+ // Do not auto-generate an IPv6 link-local address for loopback devices.
+ if n.stack.autoGenIPv6LinkLocal && !n.isLoopback() {
+ var addr tcpip.Address
+ if oIID := n.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
+ addr = header.LinkLocalAddrWithOpaqueIID(oIID.NICNameFromID(n.ID(), n.name), 0, oIID.SecretKey)
+ } else {
+ l2addr := n.linkEP.LinkAddress()
+
+ // Only attempt to generate the link-local address if we have a valid MAC
+ // address.
+ //
+ // TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
+ // LinkEndpoint.LinkAddress) before reaching this point.
+ if !header.IsValidUnicastEthernetAddress(l2addr) {
+ return nil
+ }
+
+ addr = header.LinkLocalAddr(l2addr)
+ }
- l2addr := n.linkEP.LinkAddress()
+ if _, err := n.addPermanentAddressLocked(tcpip.ProtocolAddress{
+ Protocol: header.IPv6ProtocolNumber,
+ AddressWithPrefix: tcpip.AddressWithPrefix{
+ Address: addr,
+ PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
+ },
+ }, CanBePrimaryEndpoint, static, false /* deprecated */); err != nil {
+ return err
+ }
+ }
- // Only attempt to generate the link-local address if we have a
- // valid MAC address.
+ // If we are operating as a router, then do not solicit routers since we
+ // won't process the RAs anyway.
//
- // TODO(b/141011931): Validate a LinkEndpoint's link address
- // (provided by LinkEndpoint.LinkAddress) before reaching this
- // point.
- if !header.IsValidUnicastEthernetAddress(l2addr) {
- return nil
+ // Routers do not process Router Advertisements (RA) the same way a host
+ // does. That is, routers do not learn from RAs (e.g. on-link prefixes
+ // and default routers). Therefore, soliciting RAs from other routers on
+ // a link is unnecessary for routers.
+ if !n.stack.forwarding {
+ n.ndp.startSolicitingRouters()
}
- addr := header.LinkLocalAddr(l2addr)
+ return nil
+}
+
+// becomeIPv6Router transitions n into an IPv6 router.
+//
+// When transitioning into an IPv6 router, host-only state (NDP discovered
+// routers, discovered on-link prefixes, and auto-generated addresses) will
+// be cleaned up/invalidated and NDP router solicitations will be stopped.
+func (n *NIC) becomeIPv6Router() {
+ n.mu.Lock()
+ defer n.mu.Unlock()
- _, err := n.addPermanentAddressLocked(tcpip.ProtocolAddress{
- Protocol: header.IPv6ProtocolNumber,
- AddressWithPrefix: tcpip.AddressWithPrefix{
- Address: addr,
- PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
- },
- }, CanBePrimaryEndpoint)
+ n.ndp.cleanupHostOnlyState()
+ n.ndp.stopSolicitingRouters()
+}
- return err
+// becomeIPv6Host transitions n into an IPv6 host.
+//
+// When transitioning into an IPv6 host, NDP router solicitations will be
+// started.
+func (n *NIC) becomeIPv6Host() {
+ n.mu.Lock()
+ defer n.mu.Unlock()
+
+ n.ndp.startSolicitingRouters()
}
// attachLinkEndpoint attaches the NIC to the endpoint, which will enable it
@@ -222,6 +265,10 @@ func (n *NIC) isPromiscuousMode() bool {
return rv
}
+func (n *NIC) isLoopback() bool {
+ return n.linkEP.Capabilities()&CapabilityLoopback != 0
+}
+
// setSpoofing enables or disables address spoofing.
func (n *NIC) setSpoofing(enable bool) {
n.mu.Lock()
@@ -229,14 +276,145 @@ func (n *NIC) setSpoofing(enable bool) {
n.mu.Unlock()
}
-// primaryEndpoint returns the primary endpoint of n for the given network
-// protocol.
-func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedNetworkEndpoint {
+// primaryEndpoint will return the first non-deprecated endpoint if such an
+// endpoint exists for the given protocol and remoteAddr. If no non-deprecated
+// endpoint exists, the first deprecated endpoint will be returned.
+//
+// If an IPv6 primary endpoint is requested, Source Address Selection (as
+// defined by RFC 6724 section 5) will be performed.
+func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber, remoteAddr tcpip.Address) *referencedNetworkEndpoint {
+ if protocol == header.IPv6ProtocolNumber && remoteAddr != "" {
+ return n.primaryIPv6Endpoint(remoteAddr)
+ }
+
n.mu.RLock()
defer n.mu.RUnlock()
+ var deprecatedEndpoint *referencedNetworkEndpoint
for _, r := range n.primary[protocol] {
- if r.isValidForOutgoing() && r.tryIncRef() {
+ if !r.isValidForOutgoing() {
+ continue
+ }
+
+ if !r.deprecated {
+ if r.tryIncRef() {
+ // r is not deprecated, so return it immediately.
+ //
+ // If we kept track of a deprecated endpoint, decrement its reference
+ // count since it was incremented when we decided to keep track of it.
+ if deprecatedEndpoint != nil {
+ deprecatedEndpoint.decRefLocked()
+ deprecatedEndpoint = nil
+ }
+
+ return r
+ }
+ } else if deprecatedEndpoint == nil && r.tryIncRef() {
+ // We prefer an endpoint that is not deprecated, but we keep track of r in
+ // case n doesn't have any non-deprecated endpoints.
+ //
+ // If we end up finding a more preferred endpoint, r's reference count
+ // will be decremented when such an endpoint is found.
+ deprecatedEndpoint = r
+ }
+ }
+
+ // n doesn't have any valid non-deprecated endpoints, so return
+ // deprecatedEndpoint (which may be nil if n doesn't have any valid deprecated
+ // endpoints either).
+ return deprecatedEndpoint
+}
+
+// ipv6AddrCandidate is an IPv6 candidate for Source Address Selection (RFC
+// 6724 section 5).
+type ipv6AddrCandidate struct {
+ ref *referencedNetworkEndpoint
+ scope header.IPv6AddressScope
+}
+
+// primaryIPv6Endpoint returns an IPv6 endpoint following Source Address
+// Selection (RFC 6724 section 5).
+//
+// Note, only rules 1-3 are followed.
+//
+// remoteAddr must be a valid IPv6 address.
+func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEndpoint {
+ n.mu.RLock()
+ defer n.mu.RUnlock()
+
+ primaryAddrs := n.primary[header.IPv6ProtocolNumber]
+
+ if len(primaryAddrs) == 0 {
+ return nil
+ }
+
+ // Create a candidate set of available addresses we can potentially use as a
+ // source address.
+ cs := make([]ipv6AddrCandidate, 0, len(primaryAddrs))
+ for _, r := range primaryAddrs {
+ // If r is not valid for outgoing connections, it is not a valid endpoint.
+ if !r.isValidForOutgoing() {
+ continue
+ }
+
+ addr := r.ep.ID().LocalAddress
+ scope, err := header.ScopeForIPv6Address(addr)
+ if err != nil {
+ // Should never happen as we got r from the primary IPv6 endpoint list and
+ // ScopeForIPv6Address only returns an error if addr is not an IPv6
+ // address.
+ log.Fatalf("header.ScopeForIPv6Address(%s): %s", addr, err)
+ }
+
+ cs = append(cs, ipv6AddrCandidate{
+ ref: r,
+ scope: scope,
+ })
+ }
+
+ remoteScope, err := header.ScopeForIPv6Address(remoteAddr)
+ if err != nil {
+ // primaryIPv6Endpoint should never be called with an invalid IPv6 address.
+ log.Fatalf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err)
+ }
+
+ // Sort the addresses as per RFC 6724 section 5 rules 1-3.
+ //
+ // TODO(b/146021396): Implement rules 4-8 of RFC 6724 section 5.
+ sort.Slice(cs, func(i, j int) bool {
+ sa := cs[i]
+ sb := cs[j]
+
+ // Prefer same address as per RFC 6724 section 5 rule 1.
+ if sa.ref.ep.ID().LocalAddress == remoteAddr {
+ return true
+ }
+ if sb.ref.ep.ID().LocalAddress == remoteAddr {
+ return false
+ }
+
+ // Prefer appropriate scope as per RFC 6724 section 5 rule 2.
+ if sa.scope < sb.scope {
+ return sa.scope >= remoteScope
+ } else if sb.scope < sa.scope {
+ return sb.scope < remoteScope
+ }
+
+ // Avoid deprecated addresses as per RFC 6724 section 5 rule 3.
+ if saDep, sbDep := sa.ref.deprecated, sb.ref.deprecated; saDep != sbDep {
+ // If sa is not deprecated, it is preferred over sb.
+ return sbDep
+ }
+
+ // sa and sb are equal, return the endpoint that is closest to the front of
+ // the primary endpoint list.
+ return i < j
+ })
+
+ // Return the most preferred address that can have its reference count
+ // incremented.
+ for _, c := range cs {
+ if r := c.ref; r.tryIncRef() {
return r
}
}
@@ -244,6 +422,20 @@ func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedN
return nil
}
+// hasPermanentAddrLocked returns true if n has a permanent (including currently
+// tentative) address, addr.
+func (n *NIC) hasPermanentAddrLocked(addr tcpip.Address) bool {
+ ref, ok := n.endpoints[NetworkEndpointID{addr}]
+
+ if !ok {
+ return false
+ }
+
+ kind := ref.getKind()
+
+ return kind == permanent || kind == permanentTentative
+}
+
func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint {
return n.getRefOrCreateTemp(protocol, dst, CanBePrimaryEndpoint, n.promiscuous)
}
@@ -335,13 +527,18 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t
Address: address,
PrefixLen: netProto.DefaultPrefixLen(),
},
- }, peb, temporary)
+ }, peb, temporary, static, false)
n.mu.Unlock()
return ref
}
-func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) (*referencedNetworkEndpoint, *tcpip.Error) {
+// addPermanentAddressLocked adds a permanent address to n.
+//
+// If n already has the address in a non-permanent state,
+// addPermanentAddressLocked will promote it to permanent and update the
+// endpoint with the properties provided.
+func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, configType networkEndpointConfigType, deprecated bool) (*referencedNetworkEndpoint, *tcpip.Error) {
id := NetworkEndpointID{protocolAddress.AddressWithPrefix.Address}
if ref, ok := n.endpoints[id]; ok {
switch ref.getKind() {
@@ -349,10 +546,14 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p
// The NIC already have a permanent endpoint with that address.
return nil, tcpip.ErrDuplicateAddress
case permanentExpired, temporary:
- // Promote the endpoint to become permanent and respect
- // the new peb.
+ // Promote the endpoint to become permanent and respect the new peb,
+ // configType and deprecated status.
if ref.tryIncRef() {
+ // TODO(b/147748385): Perform Duplicate Address Detection when promoting
+ // an IPv6 endpoint to permanent.
ref.setKind(permanent)
+ ref.deprecated = deprecated
+ ref.configType = configType
refs := n.primary[ref.protocol]
for i, r := range refs {
@@ -384,10 +585,14 @@ func (n *NIC) addPermanentAddressLocked(protocolAddress tcpip.ProtocolAddress, p
}
}
- return n.addAddressLocked(protocolAddress, peb, permanent)
+ return n.addAddressLocked(protocolAddress, peb, permanent, configType, deprecated)
}
-func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind) (*referencedNetworkEndpoint, *tcpip.Error) {
+// addAddressLocked adds a new protocolAddress to n.
+//
+// If the address is already known by n (irrespective of the state it is in),
+// addAddressLocked does nothing and returns tcpip.ErrDuplicateAddress.
+func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind, configType networkEndpointConfigType, deprecated bool) (*referencedNetworkEndpoint, *tcpip.Error) {
// TODO(b/141022673): Validate IP address before adding them.
// Sanity check.
@@ -417,11 +622,13 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
}
ref := &referencedNetworkEndpoint{
- refs: 1,
- ep: ep,
- nic: n,
- protocol: protocolAddress.Protocol,
- kind: kind,
+ refs: 1,
+ ep: ep,
+ nic: n,
+ protocol: protocolAddress.Protocol,
+ kind: kind,
+ configType: configType,
+ deprecated: deprecated,
}
// Set up cache if link address resolution exists for this protocol.
@@ -459,7 +666,7 @@ func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb Primar
func (n *NIC) AddAddress(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) *tcpip.Error {
// Add the endpoint.
n.mu.Lock()
- _, err := n.addPermanentAddressLocked(protocolAddress, peb)
+ _, err := n.addPermanentAddressLocked(protocolAddress, peb, static, false /* deprecated */)
n.mu.Unlock()
return err
@@ -520,6 +727,51 @@ func (n *NIC) PrimaryAddresses() []tcpip.ProtocolAddress {
return addrs
}
+// primaryAddress returns the primary address associated with this NIC.
+//
+// primaryAddress will return the first non-deprecated address if such an
+// address exists. If no non-deprecated address exists, the first deprecated
+// address will be returned.
+func (n *NIC) primaryAddress(proto tcpip.NetworkProtocolNumber) tcpip.AddressWithPrefix {
+ n.mu.RLock()
+ defer n.mu.RUnlock()
+
+ list, ok := n.primary[proto]
+ if !ok {
+ return tcpip.AddressWithPrefix{}
+ }
+
+ var deprecatedEndpoint *referencedNetworkEndpoint
+ for _, ref := range list {
+ // Don't include tentative, expired or temporary endpoints to avoid confusion
+ // and prevent the caller from using those.
+ switch ref.getKind() {
+ case permanentTentative, permanentExpired, temporary:
+ continue
+ }
+
+ if !ref.deprecated {
+ return tcpip.AddressWithPrefix{
+ Address: ref.ep.ID().LocalAddress,
+ PrefixLen: ref.ep.PrefixLen(),
+ }
+ }
+
+ if deprecatedEndpoint == nil {
+ deprecatedEndpoint = ref
+ }
+ }
+
+ if deprecatedEndpoint != nil {
+ return tcpip.AddressWithPrefix{
+ Address: deprecatedEndpoint.ep.ID().LocalAddress,
+ PrefixLen: deprecatedEndpoint.ep.PrefixLen(),
+ }
+ }
+
+ return tcpip.AddressWithPrefix{}
+}
+
// AddAddressRange adds a range of addresses to n, so that it starts accepting
// packets targeted at the given addresses and network protocol. The range is
// given by a subnet address, and all addresses contained in the subnet are
@@ -547,7 +799,7 @@ func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) {
n.mu.Unlock()
}
-// Subnets returns the Subnets associated with this NIC.
+// AddressRanges returns the Subnets associated with this NIC.
func (n *NIC) AddressRanges() []tcpip.Subnet {
n.mu.RLock()
defer n.mu.RUnlock()
@@ -624,9 +876,18 @@ func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
isIPv6Unicast := r.protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(addr)
- // If we are removing a tentative IPv6 unicast address, stop DAD.
- if isIPv6Unicast && kind == permanentTentative {
- n.ndp.stopDuplicateAddressDetection(addr)
+ if isIPv6Unicast {
+ // If we are removing a tentative IPv6 unicast address, stop
+ // DAD.
+ if kind == permanentTentative {
+ n.ndp.stopDuplicateAddressDetection(addr)
+ }
+
+ // If we are removing an address generated via SLAAC, cleanup
+ // its SLAAC resources and notify the integrator.
+ if r.configType == slaac {
+ n.ndp.cleanupAutoGenAddrResourcesAndNotify(addr)
+ }
}
r.setKind(permanentExpired)
@@ -687,7 +948,7 @@ func (n *NIC) joinGroupLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.A
Address: addr,
PrefixLen: netProto.DefaultPrefixLen(),
},
- }, NeverPrimaryEndpoint); err != nil {
+ }, NeverPrimaryEndpoint, static, false /* deprecated */); err != nil {
return err
}
}
@@ -989,7 +1250,7 @@ const (
// removing the permanent address from the NIC.
permanent
- // An expired permanent endoint is a permanent endoint that had its address
+ // An expired permanent endpoint is a permanent endpoint that had its address
// removed from the NIC, and it is waiting to be removed once no more routes
// hold a reference to it. This is achieved by decreasing its reference count
// by 1. If its address is re-added before the endpoint is removed, its type
@@ -1035,6 +1296,19 @@ func (n *NIC) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep
}
}
+type networkEndpointConfigType int32
+
+const (
+ // A statically configured endpoint is an address that was added by
+ // some user-specified action (adding an explicit address, joining a
+ // multicast group).
+ static networkEndpointConfigType = iota
+
+ // A slaac configured endpoint is an IPv6 endpoint that was
+ // added by SLAAC as per RFC 4862 section 5.5.3.
+ slaac
+)
+
type referencedNetworkEndpoint struct {
ep NetworkEndpoint
nic *NIC
@@ -1050,6 +1324,16 @@ type referencedNetworkEndpoint struct {
// networkEndpointKind must only be accessed using {get,set}Kind().
kind networkEndpointKind
+
+ // configType is the method that was used to configure this endpoint.
+ // This must never change except during endpoint creation and promotion to
+ // permanent.
+ configType networkEndpointConfigType
+
+ // deprecated indicates whether or not the endpoint should be considered
+ // deprecated. That is, when deprecated is true, other endpoints that are not
+ // deprecated should be preferred.
+ deprecated bool
}
func (r *referencedNetworkEndpoint) getKind() networkEndpointKind {
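
The comparator added to primaryIPv6Endpoint above implements rules 1-3 of RFC 6724 section 5. The rule 2 branch is the subtle one: of two candidates with different scopes, the narrower scope wins only while it still covers the destination's scope. Below is a simplified, self-contained model of that ordering (plain integers stand in for header.IPv6AddressScope, and the addresses are invented examples), mirroring the sort logic in this change:

package main

import (
	"fmt"
	"sort"
)

// candidate is a simplified stand-in for ipv6AddrCandidate: an address
// label, a numeric scope (smaller = narrower, e.g. link-local < global),
// and a deprecated flag.
type candidate struct {
	addr       string
	scope      int
	deprecated bool
}

// orderForRemote sorts candidates as per RFC 6724 section 5 rules 1-3.
func orderForRemote(cs []candidate, remoteAddr string, remoteScope int) {
	sort.Slice(cs, func(i, j int) bool {
		sa, sb := cs[i], cs[j]

		// Rule 1: prefer the same address as the destination.
		if sa.addr == remoteAddr {
			return true
		}
		if sb.addr == remoteAddr {
			return false
		}

		// Rule 2: prefer an appropriate scope.
		if sa.scope < sb.scope {
			return sa.scope >= remoteScope
		} else if sb.scope < sa.scope {
			return sb.scope < remoteScope
		}

		// Rule 3: avoid deprecated addresses.
		if sa.deprecated != sb.deprecated {
			return sb.deprecated
		}

		// Otherwise keep the earlier (primary list) position.
		return i < j
	})
}

func main() {
	const linkLocal, global = 1, 2 // assumed relative scope ordering
	cs := []candidate{
		{addr: "fe80::1", scope: linkLocal},
		{addr: "2001:db8::1", scope: global, deprecated: true},
		{addr: "2001:db8::2", scope: global},
	}
	orderForRemote(cs, "2001:db8::99", global)
	fmt.Println(cs[0].addr) // "2001:db8::2": global scope and not deprecated
}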
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
index 61fd46d66..2b8751d49 100644
--- a/pkg/tcpip/stack/registration.go
+++ b/pkg/tcpip/stack/registration.go
@@ -234,15 +234,15 @@ type NetworkEndpoint interface {
// WritePacket writes a packet to the given destination address and
// protocol. It sets pkt.NetworkHeader. pkt.TransportHeader must have
// already been set.
- WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, loop PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error
+ WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error
// WritePackets writes packets to the given destination address and
// protocol. pkts must not be zero length.
- WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams, loop PacketLooping) (int, *tcpip.Error)
+ WritePackets(r *Route, gso *GSO, pkts []tcpip.PacketBuffer, params NetworkHeaderParams) (int, *tcpip.Error)
// WriteHeaderIncludedPacket writes a packet that includes a network
// header to the given destination address.
- WriteHeaderIncludedPacket(r *Route, loop PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error
+ WriteHeaderIncludedPacket(r *Route, pkt tcpip.PacketBuffer) *tcpip.Error
// ID returns the network protocol endpoint ID.
ID() *NetworkEndpointID
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 34307ae07..517f4b941 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -158,7 +158,7 @@ func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt tcpip.Pack
return tcpip.ErrInvalidEndpointState
}
- err := r.ref.ep.WritePacket(r, gso, params, r.Loop, pkt)
+ err := r.ref.ep.WritePacket(r, gso, params, pkt)
if err != nil {
r.Stats().IP.OutgoingPacketErrors.Increment()
} else {
@@ -174,7 +174,7 @@ func (r *Route) WritePackets(gso *GSO, pkts []tcpip.PacketBuffer, params Network
return 0, tcpip.ErrInvalidEndpointState
}
- n, err := r.ref.ep.WritePackets(r, gso, pkts, params, r.Loop)
+ n, err := r.ref.ep.WritePackets(r, gso, pkts, params)
if err != nil {
r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(len(pkts) - n))
}
@@ -195,7 +195,7 @@ func (r *Route) WriteHeaderIncludedPacket(pkt tcpip.PacketBuffer) *tcpip.Error {
return tcpip.ErrInvalidEndpointState
}
- if err := r.ref.ep.WriteHeaderIncludedPacket(r, r.Loop, pkt); err != nil {
+ if err := r.ref.ep.WriteHeaderIncludedPacket(r, pkt); err != nil {
r.Stats().IP.OutgoingPacketErrors.Increment()
return err
}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
index 0e88643a4..fc56a6d79 100644
--- a/pkg/tcpip/stack/stack.go
+++ b/pkg/tcpip/stack/stack.go
@@ -21,13 +21,13 @@ package stack
import (
"encoding/binary"
- "sync"
"sync/atomic"
"time"
"golang.org/x/time/rate"
"gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/sleep"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -352,6 +352,38 @@ func (u *uniqueIDGenerator) UniqueID() uint64 {
return atomic.AddUint64((*uint64)(u), 1)
}
+// NICNameFromID is a function that returns a stable name for the specified NIC,
+// even if different NIC IDs are used to refer to the same NIC in different
+// program runs. It is used when generating opaque interface identifiers (IIDs).
+// If the NIC was created with a name, it will be passed to NICNameFromID.
+//
+// NICNameFromID SHOULD return unique NIC names so unique opaque IIDs are
+// generated for the same prefix on different NICs.
+type NICNameFromID func(tcpip.NICID, string) string
+
+// OpaqueInterfaceIdentifierOptions holds the options related to the generation
+// of opaque interface identifiers (IIDs) as defined by RFC 7217.
+type OpaqueInterfaceIdentifierOptions struct {
+ // NICNameFromID is a function that returns a stable name for a specified NIC,
+ // even if the NIC ID changes over time.
+ //
+ // Must be specified to generate the opaque IID.
+ NICNameFromID NICNameFromID
+
+ // SecretKey is a pseudo-random number used as the secret key when generating
+ // opaque IIDs as defined by RFC 7217. The key SHOULD be at least
+ // header.OpaqueIIDSecretKeyMinBytes bytes and MUST follow minimum randomness
+ // requirements for security as outlined by RFC 4086. SecretKey MUST NOT
+ // change between program runs, unless explicitly changed.
+ //
+ // OpaqueInterfaceIdentifierOptions takes ownership of SecretKey. SecretKey
+ // MUST NOT be modified after Stack is created.
+ //
+ // May be nil, but a nil value is highly discouraged to maintain
+ // some level of randomness between nodes.
+ SecretKey []byte
+}
+
// Stack is a networking stack, with all supported protocols, NICs, and route
// table.
type Stack struct {
@@ -412,8 +444,8 @@ type Stack struct {
ndpConfigs NDPConfigurations
// autoGenIPv6LinkLocal determines whether or not the stack will attempt
- // to auto-generate an IPv6 link-local address for newly enabled NICs.
- // See the AutoGenIPv6LinkLocal field of Options for more details.
+ // to auto-generate an IPv6 link-local address for newly enabled non-loopback
+ // NICs. See the AutoGenIPv6LinkLocal field of Options for more details.
autoGenIPv6LinkLocal bool
// ndpDisp is the NDP event dispatcher that is used to send the netstack
@@ -422,6 +454,10 @@ type Stack struct {
// uniqueIDGenerator is a generator of unique identifiers.
uniqueIDGenerator UniqueID
+
+ // opaqueIIDOpts hold the options for generating opaque interface identifiers
+ // (IIDs) as outlined by RFC 7217.
+ opaqueIIDOpts OpaqueInterfaceIdentifierOptions
}
// UniqueID is an abstract generator of unique identifiers.
@@ -460,13 +496,15 @@ type Options struct {
// before assigning an address to a NIC.
NDPConfigs NDPConfigurations
- // AutoGenIPv6LinkLocal determins whether or not the stack will attempt
- // to auto-generate an IPv6 link-local address for newly enabled NICs.
+ // AutoGenIPv6LinkLocal determines whether or not the stack will attempt to
+ // auto-generate an IPv6 link-local address for newly enabled non-loopback
+ // NICs.
+ //
// Note, setting this to true does not mean that a link-local address
- // will be assigned right away, or at all. If Duplicate Address
- // Detection is enabled, an address will only be assigned if it
- // successfully resolves. If it fails, no further attempt will be made
- // to auto-generate an IPv6 link-local address.
+ // will be assigned right away, or at all. If Duplicate Address Detection
+ // is enabled, an address will only be assigned if it successfully resolves.
+ // If it fails, no further attempt will be made to auto-generate an IPv6
+ // link-local address.
//
// The generated link-local address will follow RFC 4291 Appendix A
// guidelines.
@@ -479,6 +517,10 @@ type Options struct {
// RawFactory produces raw endpoints. Raw endpoints are enabled only if
// this is non-nil.
RawFactory RawFactory
+
+ // OpaqueIIDOpts hold the options for generating opaque interface identifiers
+ // (IIDs) as outlined by RFC 7217.
+ OpaqueIIDOpts OpaqueInterfaceIdentifierOptions
}
// TransportEndpointInfo holds useful information about a transport endpoint
@@ -505,6 +547,49 @@ type TransportEndpointInfo struct {
RegisterNICID tcpip.NICID
}
+// AddrNetProto unwraps the specified address if it is a V4-mapped V6 address
+// and returns the network protocol number to be used to communicate with the
+// specified address. It returns an error if the passed address is incompatible
+// with the receiver.
+func (e *TransportEndpointInfo) AddrNetProto(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+ netProto := e.NetProto
+ switch len(addr.Addr) {
+ case header.IPv4AddressSize:
+ netProto = header.IPv4ProtocolNumber
+ case header.IPv6AddressSize:
+ if header.IsV4MappedAddress(addr.Addr) {
+ netProto = header.IPv4ProtocolNumber
+ addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
+ if addr.Addr == header.IPv4Any {
+ addr.Addr = ""
+ }
+ }
+ }
+
+ switch len(e.ID.LocalAddress) {
+ case header.IPv4AddressSize:
+ if len(addr.Addr) == header.IPv6AddressSize {
+ return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState
+ }
+ case header.IPv6AddressSize:
+ if len(addr.Addr) == header.IPv4AddressSize {
+ return tcpip.FullAddress{}, 0, tcpip.ErrNetworkUnreachable
+ }
+ }
+
+ switch {
+ case netProto == e.NetProto:
+ case netProto == header.IPv4ProtocolNumber && e.NetProto == header.IPv6ProtocolNumber:
+ if v6only {
+ return tcpip.FullAddress{}, 0, tcpip.ErrNoRoute
+ }
+ default:
+ return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState
+ }
+
+ return addr, netProto, nil
+}
+
// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
// marker interface.
func (*TransportEndpointInfo) IsEndpointInfo() {}
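
AddrNetProto above lets dual-stack transport endpoints unwrap a V4-mapped V6 destination and pick the network protocol to use for it. A rough worked example of the mapped-address case (the endpoint and destination below are invented; the endpoint is bound to IPv6 with no local address yet):

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

func main() {
	// An IPv6 endpoint with no local address bound yet.
	info := stack.TransportEndpointInfo{NetProto: header.IPv6ProtocolNumber}

	// ::ffff:192.168.1.1, a V4-mapped V6 address.
	mapped := tcpip.Address("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xc0\xa8\x01\x01")

	addr, netProto, err := info.AddrNetProto(tcpip.FullAddress{Addr: mapped, Port: 80}, false /* v6only */)
	if err != nil {
		panic(err)
	}

	// Since v6only is false, the address is unwrapped to its 4-byte IPv4
	// form and the effective network protocol becomes IPv4.
	fmt.Println(len(addr.Addr) == header.IPv4AddressSize) // true
	fmt.Println(netProto == header.IPv4ProtocolNumber)    // true
}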
@@ -549,6 +634,7 @@ func New(opts Options) *Stack {
autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
uniqueIDGenerator: opts.UniqueID,
ndpDisp: opts.NDPDisp,
+ opaqueIIDOpts: opts.OpaqueIIDOpts,
}
// Add specified network protocols.
@@ -662,11 +748,37 @@ func (s *Stack) Stats() tcpip.Stats {
}
// SetForwarding enables or disables the packet forwarding between NICs.
+//
+// When forwarding becomes enabled, any host-only state on all NICs will be
+// cleaned up and if IPv6 is enabled, NDP Router Solicitations will be started.
+// When forwarding becomes disabled and if IPv6 is enabled, NDP Router
+// Solicitations will be stopped.
func (s *Stack) SetForwarding(enable bool) {
// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // If forwarding status didn't change, do nothing further.
+ if s.forwarding == enable {
+ return
+ }
+
s.forwarding = enable
- s.mu.Unlock()
+
+ // If this stack does not support IPv6, do nothing further.
+ if _, ok := s.networkProtocols[header.IPv6ProtocolNumber]; !ok {
+ return
+ }
+
+ if enable {
+ for _, nic := range s.nics {
+ nic.becomeIPv6Router()
+ }
+ } else {
+ for _, nic := range s.nics {
+ nic.becomeIPv6Host()
+ }
+ }
}
// Forwarding returns if the packet forwarding between NICs is enabled.
@@ -733,9 +845,30 @@ func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNum
return s.rawFactory.NewPacketEndpoint(s, cooked, netProto, waiterQueue)
}
-// createNIC creates a NIC with the provided id and link-layer endpoint, and
-// optionally enable it.
-func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled, loopback bool) *tcpip.Error {
+// NICContext is an opaque pointer used to store client-supplied NIC metadata.
+type NICContext interface{}
+
+// NICOptions specifies the configuration of a NIC as it is being created.
+// The zero value creates an enabled, unnamed NIC.
+type NICOptions struct {
+ // Name specifies the name of the NIC.
+ Name string
+
+ // Disabled specifies whether to avoid calling Attach on the passed
+ // LinkEndpoint.
+ Disabled bool
+
+ // Context specifies user-defined data that will be returned in stack.NICInfo
+ // for the NIC. Clients of this library can use it to add metadata that
+ // should be tracked alongside a NIC, to avoid having to keep a
+ // map[tcpip.NICID]metadata mirroring stack.Stack's nic map.
+ Context NICContext
+}
+
+// CreateNICWithOptions creates a NIC with the provided id, LinkEndpoint, and
+// NICOptions. See the documentation on type NICOptions for details on how
+// NICs can be configured.
+func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOptions) *tcpip.Error {
s.mu.Lock()
defer s.mu.Unlock()
@@ -744,44 +877,20 @@ func (s *Stack) createNIC(id tcpip.NICID, name string, ep LinkEndpoint, enabled,
return tcpip.ErrDuplicateNICID
}
- n := newNIC(s, id, name, ep, loopback)
+ n := newNIC(s, id, opts.Name, ep, opts.Context)
s.nics[id] = n
- if enabled {
+ if !opts.Disabled {
return n.enable()
}
return nil
}
-// CreateNIC creates a NIC with the provided id and link-layer endpoint.
+// CreateNIC creates a NIC with the provided id and LinkEndpoint and calls
+// `LinkEndpoint.Attach` to start delivering packets to it.
func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
- return s.createNIC(id, "", ep, true, false)
-}
-
-// CreateNamedNIC creates a NIC with the provided id and link-layer endpoint,
-// and a human-readable name.
-func (s *Stack) CreateNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error {
- return s.createNIC(id, name, ep, true, false)
-}
-
-// CreateNamedLoopbackNIC creates a NIC with the provided id and link-layer
-// endpoint, and a human-readable name.
-func (s *Stack) CreateNamedLoopbackNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error {
- return s.createNIC(id, name, ep, true, true)
-}
-
-// CreateDisabledNIC creates a NIC with the provided id and link-layer endpoint,
-// but leave it disable. Stack.EnableNIC must be called before the link-layer
-// endpoint starts delivering packets to it.
-func (s *Stack) CreateDisabledNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
- return s.createNIC(id, "", ep, false, false)
-}
-
-// CreateDisabledNamedNIC is a combination of CreateNamedNIC and
-// CreateDisabledNIC.
-func (s *Stack) CreateDisabledNamedNIC(id tcpip.NICID, name string, ep LinkEndpoint) *tcpip.Error {
- return s.createNIC(id, name, ep, false, false)
+ return s.CreateNICWithOptions(id, ep, NICOptions{})
}
// EnableNIC enables the given NIC so that the link-layer endpoint can start
@@ -809,7 +918,7 @@ func (s *Stack) CheckNIC(id tcpip.NICID) bool {
return false
}
-// NICSubnets returns a map of NICIDs to their associated subnets.
+// NICAddressRanges returns a map of NICIDs to their associated subnets.
func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
s.mu.RLock()
defer s.mu.RUnlock()
@@ -835,6 +944,18 @@ type NICInfo struct {
MTU uint32
Stats NICStats
+
+ // Context is user-supplied data optionally supplied in CreateNICWithOptions.
+ // See type NICOptions for more details.
+ Context NICContext
+}
+
+// HasNIC returns true if the NICID is defined in the stack.
+func (s *Stack) HasNIC(id tcpip.NICID) bool {
+ s.mu.RLock()
+ _, ok := s.nics[id]
+ s.mu.RUnlock()
+ return ok
}
// NICInfo returns a map of NICIDs to their associated information.
@@ -848,7 +969,7 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
Up: true, // Netstack interfaces are always up.
Running: nic.linkEP.IsAttached(),
Promiscuous: nic.isPromiscuousMode(),
- Loopback: nic.linkEP.Capabilities()&CapabilityLoopback != 0,
+ Loopback: nic.isLoopback(),
}
nics[id] = NICInfo{
Name: nic.name,
@@ -857,6 +978,7 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
Flags: flags,
MTU: nic.linkEP.MTU(),
Stats: nic.stats,
+ Context: nic.context,
}
}
return nics
@@ -973,9 +1095,11 @@ func (s *Stack) AllAddresses() map[tcpip.NICID][]tcpip.ProtocolAddress {
return nics
}
-// GetMainNICAddress returns the first primary address and prefix for the given
-// NIC and protocol. Returns an error if the NIC doesn't exist and an empty
-// value if the NIC doesn't have a primary address for the given protocol.
+// GetMainNICAddress returns the first non-deprecated primary address and prefix
+// for the given NIC and protocol. If no non-deprecated primary address exists,
+// a deprecated primary address and prefix will be returned. Returns an error if
+// the NIC doesn't exist and an empty value if the NIC doesn't have a primary
+// address for the given protocol.
func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, *tcpip.Error) {
s.mu.RLock()
defer s.mu.RUnlock()
@@ -985,17 +1109,12 @@ func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocol
return tcpip.AddressWithPrefix{}, tcpip.ErrUnknownNICID
}
- for _, a := range nic.PrimaryAddresses() {
- if a.Protocol == protocol {
- return a.AddressWithPrefix, nil
- }
- }
- return tcpip.AddressWithPrefix{}, nil
+ return nic.primaryAddress(protocol), nil
}
-func (s *Stack) getRefEP(nic *NIC, localAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) {
+func (s *Stack) getRefEP(nic *NIC, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) {
if len(localAddr) == 0 {
- return nic.primaryEndpoint(netProto)
+ return nic.primaryEndpoint(netProto, remoteAddr)
}
return nic.findEndpoint(netProto, localAddr, CanBePrimaryEndpoint)
}
@@ -1011,8 +1130,8 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
needRoute := !(isBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr))
if id != 0 && !needRoute {
if nic, ok := s.nics[id]; ok {
- if ref := s.getRefEP(nic, localAddr, netProto); ref != nil {
- return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback), nil
+ if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
+ return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()), nil
}
}
} else {
@@ -1021,14 +1140,14 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n
continue
}
if nic, ok := s.nics[route.NIC]; ok {
- if ref := s.getRefEP(nic, localAddr, netProto); ref != nil {
+ if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
if len(remoteAddr) == 0 {
// If no remote address was provided, then the route
// provided will refer to the link local address.
remoteAddr = ref.ep.ID().LocalAddress
}
- r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback)
+ r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback())
if needRoute {
r.NextHop = route.Gateway
}
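
CreateNICWithOptions and NICOptions replace the CreateNamedNIC/CreateDisabledNIC family removed above, and the caller-supplied Context is handed back through NICInfo. A rough usage sketch (the NIC name, metadata type, and values are invented for illustration):

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// ifaceMeta is a caller-defined type; NICContext is opaque to the stack.
type ifaceMeta struct {
	vlan int
}

func main() {
	s := stack.New(stack.Options{})

	id := tcpip.NICID(1)
	ep := loopback.New()

	// Name, Disabled and Context fold the removed constructor variants
	// into a single options struct.
	opts := stack.NICOptions{
		Name:     "lo0", // hypothetical name
		Disabled: true,  // call s.EnableNIC(id) later to attach the endpoint
		Context:  &ifaceMeta{vlan: 42},
	}
	if err := s.CreateNICWithOptions(id, ep, opts); err != nil {
		panic(err)
	}

	// The metadata round-trips through NICInfo, and HasNIC reports the NIC.
	if info, ok := s.NICInfo()[id]; ok {
		fmt.Println(info.Name, info.Context.(*ifaceMeta).vlan) // "lo0 42"
	}
	fmt.Println(s.HasNIC(id)) // true
}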
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
index 8fc034ca1..4b3d18f1b 100644
--- a/pkg/tcpip/stack/stack_test.go
+++ b/pkg/tcpip/stack/stack_test.go
@@ -27,12 +27,15 @@ import (
"time"
"github.com/google/go-cmp/cmp"
+ "gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+ "gvisor.dev/gvisor/pkg/tcpip/link/loopback"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/tcpip/transport/udp"
)
const (
@@ -122,7 +125,7 @@ func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities {
return f.ep.Capabilities()
}
-func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt tcpip.PacketBuffer) *tcpip.Error {
// Increment the sent packet count in the protocol descriptor.
f.proto.sendPacketCount[int(r.RemoteAddress[0])%len(f.proto.sendPacketCount)]++
@@ -133,7 +136,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
b[1] = f.id.LocalAddress[0]
b[2] = byte(params.Protocol)
- if loop&stack.PacketLoop != 0 {
+ if r.Loop&stack.PacketLoop != 0 {
views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
views[0] = pkt.Header.View()
views = append(views, pkt.Data.Views()...)
@@ -141,7 +144,7 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
})
}
- if loop&stack.PacketOut == 0 {
+ if r.Loop&stack.PacketOut == 0 {
return nil
}
@@ -149,11 +152,11 @@ func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params
}
// WritePackets implements stack.LinkEndpoint.WritePackets.
-func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams, loop stack.PacketLooping) (int, *tcpip.Error) {
+func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts []tcpip.PacketBuffer, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
panic("not implemented")
}
-func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, loop stack.PacketLooping, pkt tcpip.PacketBuffer) *tcpip.Error {
+func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt tcpip.PacketBuffer) *tcpip.Error {
return tcpip.ErrNotSupported
}
@@ -1894,55 +1897,67 @@ func TestNICForwarding(t *testing.T) {
}
// TestNICAutoGenAddr tests the auto-generation of IPv6 link-local addresses
-// (or lack there-of if disabled (default)). Note, DAD will be disabled in
-// these tests.
+// using the modified EUI-64 of the NIC's MAC address (or lack thereof if
+// disabled (default)). Note, DAD will be disabled in these tests.
func TestNICAutoGenAddr(t *testing.T) {
tests := []struct {
name string
autoGen bool
linkAddr tcpip.LinkAddress
+ iidOpts stack.OpaqueInterfaceIdentifierOptions
shouldGen bool
}{
{
"Disabled",
false,
linkAddr1,
+ stack.OpaqueInterfaceIdentifierOptions{
+ NICNameFromID: func(nicID tcpip.NICID, _ string) string {
+ return fmt.Sprintf("nic%d", nicID)
+ },
+ },
false,
},
{
"Enabled",
true,
linkAddr1,
+ stack.OpaqueInterfaceIdentifierOptions{},
true,
},
{
"Nil MAC",
true,
tcpip.LinkAddress([]byte(nil)),
+ stack.OpaqueInterfaceIdentifierOptions{},
false,
},
{
"Empty MAC",
true,
tcpip.LinkAddress(""),
+ stack.OpaqueInterfaceIdentifierOptions{},
false,
},
{
"Invalid MAC",
true,
tcpip.LinkAddress("\x01\x02\x03"),
+ stack.OpaqueInterfaceIdentifierOptions{},
false,
},
{
"Multicast MAC",
true,
tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+ stack.OpaqueInterfaceIdentifierOptions{},
false,
},
{
"Unspecified MAC",
true,
tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"),
+ stack.OpaqueInterfaceIdentifierOptions{},
false,
},
}
@@ -1951,13 +1966,12 @@ func TestNICAutoGenAddr(t *testing.T) {
t.Run(test.name, func(t *testing.T) {
opts := stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ OpaqueIIDOpts: test.iidOpts,
}
if test.autoGen {
- // Only set opts.AutoGenIPv6LinkLocal when
- // test.autoGen is true because
- // opts.AutoGenIPv6LinkLocal should be false by
- // default.
+ // Only set opts.AutoGenIPv6LinkLocal when test.autoGen is true because
+ // opts.AutoGenIPv6LinkLocal should be false by default.
opts.AutoGenIPv6LinkLocal = true
}
@@ -1973,9 +1987,171 @@ func TestNICAutoGenAddr(t *testing.T) {
}
if test.shouldGen {
+ // Should have auto-generated an address and resolved immediately (DAD
+ // is disabled).
+ if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddr(test.linkAddr), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
+ }
+ } else {
+ // Should not have auto-generated an address.
+ if want := (tcpip.AddressWithPrefix{}); addr != want {
+ t.Fatalf("got stack.GetMainNICAddress(_, _) = (%s, nil), want = (%s, nil)", addr, want)
+ }
+ }
+ })
+ }
+}
+
+// TestNICContextPreservation tests that you can read out via stack.NICInfo the
+// Context data you pass via NICContext.Context in stack.CreateNICWithOptions.
+func TestNICContextPreservation(t *testing.T) {
+ var ctx *int
+ tests := []struct {
+ name string
+ opts stack.NICOptions
+ want stack.NICContext
+ }{
+ {
+ "context_set",
+ stack.NICOptions{Context: ctx},
+ ctx,
+ },
+ {
+ "context_not_set",
+ stack.NICOptions{},
+ nil,
+ },
+ }
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ s := stack.New(stack.Options{})
+ id := tcpip.NICID(1)
+ ep := channel.New(0, 0, tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"))
+ if err := s.CreateNICWithOptions(id, ep, test.opts); err != nil {
+ t.Fatalf("got stack.CreateNICWithOptions(%d, %+v, %+v) = %s, want nil", id, ep, test.opts, err)
+ }
+ nicinfos := s.NICInfo()
+ nicinfo, ok := nicinfos[id]
+ if !ok {
+ t.Fatalf("got nicinfos[%d] = _, %t, want _, true; nicinfos = %+v", id, ok, nicinfos)
+ }
+ if got, want := nicinfo.Context == test.want, true; got != want {
+ t.Fatal("got nicinfo.Context == ctx = %t, want %t; nicinfo.Context = %p, ctx = %p", got, want, nicinfo.Context, test.want)
+ }
+ })
+ }
+}
+
+// TestNICAutoGenAddrWithOpaque tests the auto-generation of IPv6 link-local
+// addresses with opaque interface identifiers. Link Local addresses should
+// always be generated with opaque IIDs if configured to use them, even if the
+// NIC has an invalid MAC address.
+func TestNICAutoGenAddrWithOpaque(t *testing.T) {
+ const nicID = 1
+
+ var secretKey [header.OpaqueIIDSecretKeyMinBytes]byte
+ n, err := rand.Read(secretKey[:])
+ if err != nil {
+ t.Fatalf("rand.Read(_): %s", err)
+ }
+ if n != header.OpaqueIIDSecretKeyMinBytes {
+ t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", header.OpaqueIIDSecretKeyMinBytes, n)
+ }
+
+ tests := []struct {
+ name string
+ nicName string
+ autoGen bool
+ linkAddr tcpip.LinkAddress
+ secretKey []byte
+ }{
+ {
+ name: "Disabled",
+ nicName: "nic1",
+ autoGen: false,
+ linkAddr: linkAddr1,
+ secretKey: secretKey[:],
+ },
+ {
+ name: "Enabled",
+ nicName: "nic1",
+ autoGen: true,
+ linkAddr: linkAddr1,
+ secretKey: secretKey[:],
+ },
+ // These are all cases where we would not have generated a
+ // link-local address if opaque IIDs were disabled.
+ {
+ name: "Nil MAC and empty nicName",
+ nicName: "",
+ autoGen: true,
+ linkAddr: tcpip.LinkAddress([]byte(nil)),
+ secretKey: secretKey[:1],
+ },
+ {
+ name: "Empty MAC and empty nicName",
+ autoGen: true,
+ linkAddr: tcpip.LinkAddress(""),
+ secretKey: secretKey[:2],
+ },
+ {
+ name: "Invalid MAC",
+ nicName: "test",
+ autoGen: true,
+ linkAddr: tcpip.LinkAddress("\x01\x02\x03"),
+ secretKey: secretKey[:3],
+ },
+ {
+ name: "Multicast MAC",
+ nicName: "test2",
+ autoGen: true,
+ linkAddr: tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+ secretKey: secretKey[:4],
+ },
+ {
+ name: "Unspecified MAC and nil SecretKey",
+ nicName: "test3",
+ autoGen: true,
+ linkAddr: tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"),
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ opts := stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+ NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+ return nicName
+ },
+ SecretKey: test.secretKey,
+ },
+ }
+
+ if test.autoGen {
+ // Only set opts.AutoGenIPv6LinkLocal when
+ // test.autoGen is true because
+ // opts.AutoGenIPv6LinkLocal should be false by
+ // default.
+ opts.AutoGenIPv6LinkLocal = true
+ }
+
+ e := channel.New(10, 1280, test.linkAddr)
+ s := stack.New(opts)
+ nicOpts := stack.NICOptions{Name: test.nicName}
+ if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+ t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
+ }
+
+ addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err)
+ }
+
+ if test.autoGen {
// Should have auto-generated an address and
// resolved immediately (DAD is disabled).
- if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddr(test.linkAddr), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+ if want := (tcpip.AddressWithPrefix{Address: header.LinkLocalAddrWithOpaqueIID(test.nicName, 0, test.secretKey), PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr, want)
}
} else {
@@ -1988,6 +2164,56 @@ func TestNICAutoGenAddr(t *testing.T) {
}
}
+// TestNoLinkLocalAutoGenForLoopbackNIC tests that IPv6 link-local addresses are
+// not auto-generated for loopback NICs.
+func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) {
+ const nicID = 1
+ const nicName = "nicName"
+
+ tests := []struct {
+ name string
+ opaqueIIDOpts stack.OpaqueInterfaceIdentifierOptions
+ }{
+ {
+ name: "IID From MAC",
+ opaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{},
+ },
+ {
+ name: "Opaque IID",
+ opaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+ NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+ return nicName
+ },
+ },
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ opts := stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ AutoGenIPv6LinkLocal: true,
+ OpaqueIIDOpts: test.opaqueIIDOpts,
+ }
+
+ e := loopback.New()
+ s := stack.New(opts)
+ nicOpts := stack.NICOptions{Name: nicName}
+ if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+ t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, nicOpts, err)
+ }
+
+ addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+ if err != nil {
+ t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err)
+ }
+ if want := (tcpip.AddressWithPrefix{}); addr != want {
+ t.Errorf("got stack.GetMainNICAddress(%d, _) = %s, want = %s", nicID, addr, want)
+ }
+ })
+ }
+}
+
// TestNICAutoGenAddrDoesDAD tests that the successful auto-generation of IPv6
// link-local addresses will only be assigned after the DAD process resolves.
func TestNICAutoGenAddrDoesDAD(t *testing.T) {
@@ -2186,3 +2412,154 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) {
}
}
}
+
+func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
+ const (
+ linkLocalAddr1 = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+ linkLocalAddr2 = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+ uniqueLocalAddr1 = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+ uniqueLocalAddr2 = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+ globalAddr1 = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+ globalAddr2 = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+ nicID = 1
+ )
+
+ // Rule 3 is not tested here, and is instead tested by NDP's AutoGenAddr test.
+ tests := []struct {
+ name string
+ nicAddrs []tcpip.Address
+ connectAddr tcpip.Address
+ expectedLocalAddr tcpip.Address
+ }{
+ // Test Rule 1 of RFC 6724 section 5.
+ {
+ name: "Same Global most preferred (last address)",
+ nicAddrs: []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+ connectAddr: globalAddr1,
+ expectedLocalAddr: globalAddr1,
+ },
+ {
+ name: "Same Global most preferred (first address)",
+ nicAddrs: []tcpip.Address{globalAddr1, linkLocalAddr1, uniqueLocalAddr1},
+ connectAddr: globalAddr1,
+ expectedLocalAddr: globalAddr1,
+ },
+ {
+ name: "Same Link Local most preferred (last address)",
+ nicAddrs: []tcpip.Address{globalAddr1, uniqueLocalAddr1, linkLocalAddr1},
+ connectAddr: linkLocalAddr1,
+ expectedLocalAddr: linkLocalAddr1,
+ },
+ {
+ name: "Same Link Local most preferred (first address)",
+ nicAddrs: []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+ connectAddr: linkLocalAddr1,
+ expectedLocalAddr: linkLocalAddr1,
+ },
+ {
+ name: "Same Unique Local most preferred (last address)",
+ nicAddrs: []tcpip.Address{uniqueLocalAddr1, globalAddr1, linkLocalAddr1},
+ connectAddr: uniqueLocalAddr1,
+ expectedLocalAddr: uniqueLocalAddr1,
+ },
+ {
+ name: "Same Unique Local most preferred (first address)",
+ nicAddrs: []tcpip.Address{globalAddr1, linkLocalAddr1, uniqueLocalAddr1},
+ connectAddr: uniqueLocalAddr1,
+ expectedLocalAddr: uniqueLocalAddr1,
+ },
+
+ // Test Rule 2 of RFC 6724 section 5.
+ {
+ name: "Global most preferred (last address)",
+ nicAddrs: []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+ connectAddr: globalAddr2,
+ expectedLocalAddr: globalAddr1,
+ },
+ {
+ name: "Global most preferred (first address)",
+ nicAddrs: []tcpip.Address{globalAddr1, linkLocalAddr1, uniqueLocalAddr1},
+ connectAddr: globalAddr2,
+ expectedLocalAddr: globalAddr1,
+ },
+ {
+ name: "Link Local most preferred (last address)",
+ nicAddrs: []tcpip.Address{globalAddr1, uniqueLocalAddr1, linkLocalAddr1},
+ connectAddr: linkLocalAddr2,
+ expectedLocalAddr: linkLocalAddr1,
+ },
+ {
+ name: "Link Local most preferred (first address)",
+ nicAddrs: []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+ connectAddr: linkLocalAddr2,
+ expectedLocalAddr: linkLocalAddr1,
+ },
+ {
+ name: "Unique Local most preferred (last address)",
+ nicAddrs: []tcpip.Address{uniqueLocalAddr1, globalAddr1, linkLocalAddr1},
+ connectAddr: uniqueLocalAddr2,
+ expectedLocalAddr: uniqueLocalAddr1,
+ },
+ {
+ name: "Unique Local most preferred (first address)",
+ nicAddrs: []tcpip.Address{globalAddr1, linkLocalAddr1, uniqueLocalAddr1},
+ connectAddr: uniqueLocalAddr2,
+ expectedLocalAddr: uniqueLocalAddr1,
+ },
+
+ // Test returning the endpoint that is closest to the front when
+ // candidate addresses are "equal" from the perspective of RFC 6724
+ // section 5.
+ {
+ name: "Unique Local for Global",
+ nicAddrs: []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, uniqueLocalAddr2},
+ connectAddr: globalAddr2,
+ expectedLocalAddr: uniqueLocalAddr1,
+ },
+ {
+ name: "Link Local for Global",
+ nicAddrs: []tcpip.Address{linkLocalAddr1, linkLocalAddr2},
+ connectAddr: globalAddr2,
+ expectedLocalAddr: linkLocalAddr1,
+ },
+ {
+ name: "Link Local for Unique Local",
+ nicAddrs: []tcpip.Address{linkLocalAddr1, linkLocalAddr2},
+ connectAddr: uniqueLocalAddr2,
+ expectedLocalAddr: linkLocalAddr1,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ e := channel.New(0, 1280, linkAddr1)
+ s := stack.New(stack.Options{
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+ TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+ })
+ if err := s.CreateNIC(nicID, e); err != nil {
+ t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+ }
+ s.SetRouteTable([]tcpip.Route{{
+ Destination: header.IPv6EmptySubnet,
+ Gateway: llAddr3,
+ NIC: nicID,
+ }})
+ s.AddLinkAddress(nicID, llAddr3, linkAddr3)
+
+ for _, a := range test.nicAddrs {
+ if err := s.AddAddress(nicID, ipv6.ProtocolNumber, a); err != nil {
+ t.Errorf("s.AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, a, err)
+ }
+ }
+
+ if t.Failed() {
+ t.FailNow()
+ }
+
+ if got := addrForNewConnectionTo(t, s, tcpip.FullAddress{Addr: test.connectAddr, NIC: nicID, Port: 1234}); got != test.expectedLocalAddr {
+ t.Errorf("got local address = %s, want = %s", got, test.expectedLocalAddr)
+ }
+ })
+ }
+}
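The Rule 2 ordering exercised by the cases above can be summarized with a small standalone sketch; the scope encoding and helper name below are illustrative only and are not part of the stack package:

package main

import "fmt"

// Illustrative scope ranks: 0 = link-local, 1 = unique-local, 2 = global.
//
// preferRule2 reports whether a candidate source with scope a is preferred
// over one with scope b for a destination with scope d, per Rule 2 of
// RFC 6724 section 5 ("prefer appropriate scope").
func preferRule2(a, b, d int) bool {
	if a == b {
		return false // Tie; assignment order (or later rules) decides.
	}
	if a < b {
		return a >= d // The smaller scope wins only if it still covers d.
	}
	return b < d // a is larger; it wins only if b's scope is too small for d.
}

func main() {
	// A unique-local (1) source beats a link-local (0) one for a global (2)
	// destination, matching the "Unique Local for Global" case above.
	fmt.Println(preferRule2(1, 0, 2)) // true
}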
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
index 67c21be42..d686e6eb8 100644
--- a/pkg/tcpip/stack/transport_demuxer.go
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -18,8 +18,8 @@ import (
"fmt"
"math/rand"
"sort"
- "sync"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -104,7 +104,14 @@ func (epsByNic *endpointsByNic) handlePacket(r *Route, id TransportEndpointID, p
return
}
// multiPortEndpoints are guaranteed to have at least one element.
- selectEndpoint(id, mpep, epsByNic.seed).HandlePacket(r, id, pkt)
+ transEP := selectEndpoint(id, mpep, epsByNic.seed)
+ if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue {
+ queuedProtocol.QueuePacket(r, transEP, id, pkt)
+ epsByNic.mu.RUnlock()
+ return
+ }
+
+ transEP.HandlePacket(r, id, pkt)
epsByNic.mu.RUnlock() // Don't use defer for performance reasons.
}
@@ -130,7 +137,7 @@ func (epsByNic *endpointsByNic) handleControlPacket(n *NIC, id TransportEndpoint
// registerEndpoint returns true if it succeeds. It fails and returns
// false if ep already has an element with the same key.
-func (epsByNic *endpointsByNic) registerEndpoint(t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
+func (epsByNic *endpointsByNic) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
epsByNic.mu.Lock()
defer epsByNic.mu.Unlock()
@@ -140,7 +147,7 @@ func (epsByNic *endpointsByNic) registerEndpoint(t TransportEndpoint, reusePort
}
// This is a new binding.
- multiPortEp := &multiPortEndpoint{}
+ multiPortEp := &multiPortEndpoint{demux: d, netProto: netProto, transProto: transProto}
multiPortEp.endpointsMap = make(map[TransportEndpoint]int)
multiPortEp.reuse = reusePort
epsByNic.endpoints[bindToDevice] = multiPortEp
@@ -168,18 +175,34 @@ func (epsByNic *endpointsByNic) unregisterEndpoint(bindToDevice tcpip.NICID, t T
// newTransportDemuxer.
type transportDemuxer struct {
// protocol is immutable.
- protocol map[protocolIDs]*transportEndpoints
+ protocol map[protocolIDs]*transportEndpoints
+ queuedProtocols map[protocolIDs]queuedTransportProtocol
+}
+
+// queuedTransportProtocol, if implemented by a transport protocol, causes the
+// dispatcher to deliver packets to the QueuePacket method instead of
+// calling HandlePacket directly on the endpoint.
+type queuedTransportProtocol interface {
+ QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt tcpip.PacketBuffer)
}
func newTransportDemuxer(stack *Stack) *transportDemuxer {
- d := &transportDemuxer{protocol: make(map[protocolIDs]*transportEndpoints)}
+ d := &transportDemuxer{
+ protocol: make(map[protocolIDs]*transportEndpoints),
+ queuedProtocols: make(map[protocolIDs]queuedTransportProtocol),
+ }
// Add each network and transport pair to the demuxer.
for netProto := range stack.networkProtocols {
for proto := range stack.transportProtocols {
- d.protocol[protocolIDs{netProto, proto}] = &transportEndpoints{
+ protoIDs := protocolIDs{netProto, proto}
+ d.protocol[protoIDs] = &transportEndpoints{
endpoints: make(map[TransportEndpointID]*endpointsByNic),
}
+ qTransProto, isQueued := (stack.transportProtocols[proto].proto).(queuedTransportProtocol)
+ if isQueued {
+ d.queuedProtocols[protoIDs] = qTransProto
+ }
}
}
@@ -209,7 +232,11 @@ func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNum
//
// +stateify savable
type multiPortEndpoint struct {
- mu sync.RWMutex `state:"nosave"`
+ mu sync.RWMutex `state:"nosave"`
+ demux *transportDemuxer
+ netProto tcpip.NetworkProtocolNumber
+ transProto tcpip.TransportProtocolNumber
+
endpointsArr []TransportEndpoint
endpointsMap map[TransportEndpoint]int
// reuse indicates if more than one endpoint is allowed.
@@ -258,13 +285,22 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32
func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt tcpip.PacketBuffer) {
ep.mu.RLock()
+ queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}]
for i, endpoint := range ep.endpointsArr {
// HandlePacket takes ownership of pkt, so each endpoint needs
// its own copy except for the final one.
if i == len(ep.endpointsArr)-1 {
+ if mustQueue {
+ queuedProtocol.QueuePacket(r, endpoint, id, pkt)
+ break
+ }
endpoint.HandlePacket(r, id, pkt)
break
}
+ if mustQueue {
+ queuedProtocol.QueuePacket(r, endpoint, id, pkt.Clone())
+ continue
+ }
endpoint.HandlePacket(r, id, pkt.Clone())
}
ep.mu.RUnlock() // Don't use defer for performance reasons.
@@ -357,7 +393,7 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
if epsByNic, ok := eps.endpoints[id]; ok {
// There was already a binding.
- return epsByNic.registerEndpoint(ep, reusePort, bindToDevice)
+ return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
}
// This is a new binding.
@@ -367,7 +403,7 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol
}
eps.endpoints[id] = epsByNic
- return epsByNic.registerEndpoint(ep, reusePort, bindToDevice)
+ return epsByNic.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice)
}
// unregisterEndpoint unregisters the endpoint with the given id such that it
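A transport protocol opts into queued dispatch purely by method set: the type assertion in newTransportDemuxer detects any protocol implementation that provides a QueuePacket method with the shape of queuedTransportProtocol (presumably the TCP dispatcher added elsewhere in this change). A minimal, hypothetical sketch of such an opt-in; the package, type, and field names below are illustrative only:

package exampleproto

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// queuedPacket bundles everything a processing goroutine needs.
type queuedPacket struct {
	route *stack.Route
	ep    stack.TransportEndpoint
	id    stack.TransportEndpointID
	pkt   tcpip.PacketBuffer
}

// protocol is a hypothetical transport protocol implementation.
type protocol struct {
	// packets feeds a processing goroutine (not shown) that eventually calls
	// HandlePacket on the endpoint.
	packets chan queuedPacket
}

// QueuePacket satisfies the (unexported) queuedTransportProtocol interface by
// method shape; the demuxer will now queue packets here instead of calling
// HandlePacket inline on the endpoint.
func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
	p.packets <- queuedPacket{route: r, ep: ep, id: id, pkt: pkt}
}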
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
index 3b28b06d0..5e9237de9 100644
--- a/pkg/tcpip/stack/transport_demuxer_test.go
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -41,7 +41,7 @@ const (
type testContext struct {
t *testing.T
- linkEPs map[string]*channel.Endpoint
+ linkEps map[tcpip.NICID]*channel.Endpoint
s *stack.Stack
ep tcpip.Endpoint
@@ -61,35 +61,29 @@ func (c *testContext) createV6Endpoint(v6only bool) {
c.t.Fatalf("NewEndpoint failed: %v", err)
}
- var v tcpip.V6OnlyOption
- if v6only {
- v = 1
- }
- if err := c.ep.SetSockOpt(v); err != nil {
+ if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil {
c.t.Fatalf("SetSockOpt failed: %v", err)
}
}
-// newDualTestContextMultiNic creates the testing context and also linkEpNames
-// named NICs.
-func newDualTestContextMultiNic(t *testing.T, mtu uint32, linkEpNames []string) *testContext {
+// newDualTestContextMultiNIC creates the testing context and one NIC per ID in linkEpIDs.
+func newDualTestContextMultiNIC(t *testing.T, mtu uint32, linkEpIDs []tcpip.NICID) *testContext {
s := stack.New(stack.Options{
NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}})
- linkEPs := make(map[string]*channel.Endpoint)
- for i, linkEpName := range linkEpNames {
- channelEP := channel.New(256, mtu, "")
- nicID := tcpip.NICID(i + 1)
- if err := s.CreateNamedNIC(nicID, linkEpName, channelEP); err != nil {
+ linkEps := make(map[tcpip.NICID]*channel.Endpoint)
+ for _, linkEpID := range linkEpIDs {
+ channelEp := channel.New(256, mtu, "")
+ if err := s.CreateNIC(linkEpID, channelEp); err != nil {
t.Fatalf("CreateNIC failed: %v", err)
}
- linkEPs[linkEpName] = channelEP
+ linkEps[linkEpID] = channelEp
- if err := s.AddAddress(nicID, ipv4.ProtocolNumber, stackAddr); err != nil {
+ if err := s.AddAddress(linkEpID, ipv4.ProtocolNumber, stackAddr); err != nil {
t.Fatalf("AddAddress IPv4 failed: %v", err)
}
- if err := s.AddAddress(nicID, ipv6.ProtocolNumber, stackV6Addr); err != nil {
+ if err := s.AddAddress(linkEpID, ipv6.ProtocolNumber, stackV6Addr); err != nil {
t.Fatalf("AddAddress IPv6 failed: %v", err)
}
}
@@ -108,7 +102,7 @@ func newDualTestContextMultiNic(t *testing.T, mtu uint32, linkEpNames []string)
return &testContext{
t: t,
s: s,
- linkEPs: linkEPs,
+ linkEps: linkEps,
}
}
@@ -125,7 +119,7 @@ func newPayload() []byte {
return b
}
-func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpName string) {
+func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NICID) {
// Allocate a buffer for data and headers.
buf := buffer.NewView(header.UDPMinimumSize + header.IPv6MinimumSize + len(payload))
copy(buf[len(buf)-len(payload):], payload)
@@ -156,7 +150,7 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpName string
u.SetChecksum(^u.CalculateChecksum(xsum))
// Inject packet.
- c.linkEPs[linkEpName].InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
+ c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, tcpip.PacketBuffer{
Data: buf.ToVectorisedView(),
})
}
@@ -186,7 +180,7 @@ func TestTransportDemuxerRegister(t *testing.T) {
func TestDistribution(t *testing.T) {
type endpointSockopts struct {
reuse int
- bindToDevice string
+ bindToDevice tcpip.NICID
}
for _, test := range []struct {
name string
@@ -194,71 +188,71 @@ func TestDistribution(t *testing.T) {
endpoints []endpointSockopts
// wantedDistribution is the wanted ratio of packets received on each
// endpoint for each NIC on which packets are injected.
- wantedDistributions map[string][]float64
+ wantedDistributions map[tcpip.NICID][]float64
}{
{
"BindPortReuse",
// 5 endpoints that all have reuse set.
[]endpointSockopts{
- endpointSockopts{1, ""},
- endpointSockopts{1, ""},
- endpointSockopts{1, ""},
- endpointSockopts{1, ""},
- endpointSockopts{1, ""},
+ {1, 0},
+ {1, 0},
+ {1, 0},
+ {1, 0},
+ {1, 0},
},
- map[string][]float64{
+ map[tcpip.NICID][]float64{
// Injected packets on dev0 get distributed evenly.
- "dev0": []float64{0.2, 0.2, 0.2, 0.2, 0.2},
+ 1: {0.2, 0.2, 0.2, 0.2, 0.2},
},
},
{
"BindToDevice",
// 3 endpoints with various bindings.
[]endpointSockopts{
- endpointSockopts{0, "dev0"},
- endpointSockopts{0, "dev1"},
- endpointSockopts{0, "dev2"},
+ {0, 1},
+ {0, 2},
+ {0, 3},
},
- map[string][]float64{
+ map[tcpip.NICID][]float64{
// Injected packets on dev0 go only to the endpoint bound to dev0.
- "dev0": []float64{1, 0, 0},
+ 1: {1, 0, 0},
// Injected packets on dev1 go only to the endpoint bound to dev1.
- "dev1": []float64{0, 1, 0},
+ 2: {0, 1, 0},
// Injected packets on dev2 go only to the endpoint bound to dev2.
- "dev2": []float64{0, 0, 1},
+ 3: {0, 0, 1},
},
},
{
"ReuseAndBindToDevice",
// 6 endpoints with various bindings.
[]endpointSockopts{
- endpointSockopts{1, "dev0"},
- endpointSockopts{1, "dev0"},
- endpointSockopts{1, "dev1"},
- endpointSockopts{1, "dev1"},
- endpointSockopts{1, "dev1"},
- endpointSockopts{1, ""},
+ {1, 1},
+ {1, 1},
+ {1, 2},
+ {1, 2},
+ {1, 2},
+ {1, 0},
},
- map[string][]float64{
+ map[tcpip.NICID][]float64{
// Injected packets on dev0 get distributed among endpoints bound to
// dev0.
- "dev0": []float64{0.5, 0.5, 0, 0, 0, 0},
+ 1: {0.5, 0.5, 0, 0, 0, 0},
// Injected packets on dev1 get distributed among endpoints bound to
// dev1 or unbound.
- "dev1": []float64{0, 0, 1. / 3, 1. / 3, 1. / 3, 0},
+ 2: {0, 0, 1. / 3, 1. / 3, 1. / 3, 0},
// Injected packets on dev999 go only to the unbound.
- "dev999": []float64{0, 0, 0, 0, 0, 1},
+ 1000: {0, 0, 0, 0, 0, 1},
},
},
} {
t.Run(test.name, func(t *testing.T) {
for device, wantedDistribution := range test.wantedDistributions {
- t.Run(device, func(t *testing.T) {
- var devices []string
+ t.Run(string(device), func(t *testing.T) {
+ var devices []tcpip.NICID
for d := range test.wantedDistributions {
devices = append(devices, d)
}
- c := newDualTestContextMultiNic(t, defaultMTU, devices)
+ c := newDualTestContextMultiNIC(t, defaultMTU, devices)
defer c.cleanup()
c.createV6Endpoint(false)
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 748ce4ea5..f50604a8a 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -102,13 +102,23 @@ func (*fakeTransportEndpoint) SetSockOpt(interface{}) *tcpip.Error {
return tcpip.ErrInvalidEndpointState
}
+// SetSockOptBool sets a socket option. Currently not supported.
+func (*fakeTransportEndpoint) SetSockOptBool(tcpip.SockOptBool, bool) *tcpip.Error {
+ return tcpip.ErrInvalidEndpointState
+}
+
// SetSockOptInt sets a socket option. Currently not supported.
-func (*fakeTransportEndpoint) SetSockOptInt(tcpip.SockOpt, int) *tcpip.Error {
+func (*fakeTransportEndpoint) SetSockOptInt(tcpip.SockOptInt, int) *tcpip.Error {
return tcpip.ErrInvalidEndpointState
}
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (*fakeTransportEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+ return false, tcpip.ErrUnknownProtocolOption
+}
+
// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
return -1, tcpip.ErrUnknownProtocolOption
}
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index bd5eb89ca..b7813cbc0 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -35,10 +35,10 @@ import (
"reflect"
"strconv"
"strings"
- "sync"
"sync/atomic"
"time"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/iptables"
"gvisor.dev/gvisor/pkg/waiter"
@@ -308,7 +308,7 @@ type ControlMessages struct {
// HasTimestamp indicates whether Timestamp is valid/set.
HasTimestamp bool
- // Timestamp is the time (in ns) that the last packed used to create
+ // Timestamp is the time (in ns) that the last packet used to create
// the read data was received.
Timestamp int64
@@ -317,6 +317,18 @@ type ControlMessages struct {
// Inq is the number of bytes ready to be received.
Inq int32
+
+ // HasTOS indicates whether TOS is valid/set.
+ HasTOS bool
+
+ // TOS is the IPv4 type of service of the associated packet.
+ TOS uint8
+
+ // HasTClass indicates whether TClass is valid/set.
+ HasTClass bool
+
+ // TClass is the IPv6 traffic class of the associated packet.
+ TClass int32
}
// Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
@@ -411,17 +423,25 @@ type Endpoint interface {
// SetSockOpt sets a socket option. opt should be one of the *Option types.
SetSockOpt(opt interface{}) *Error
+ // SetSockOptBool sets a socket option, for simple cases where a value
+ // has the bool type.
+ SetSockOptBool(opt SockOptBool, v bool) *Error
+
// SetSockOptInt sets a socket option, for simple cases where a value
// has the int type.
- SetSockOptInt(opt SockOpt, v int) *Error
+ SetSockOptInt(opt SockOptInt, v int) *Error
// GetSockOpt gets a socket option. opt should be a pointer to one of the
// *Option types.
GetSockOpt(opt interface{}) *Error
+ // GetSockOptBool gets a socket option for simple cases where a return
+ // value has the bool type.
+ GetSockOptBool(SockOptBool) (bool, *Error)
+
// GetSockOptInt gets a socket option for simple cases where a return
// value has the int type.
- GetSockOptInt(SockOpt) (int, *Error)
+ GetSockOptInt(SockOptInt) (int, *Error)
// State returns a socket's lifecycle state. The returned value is
// protocol-specific and is primarily used for diagnostics.
@@ -476,13 +496,26 @@ type WriteOptions struct {
Atomic bool
}
-// SockOpt represents socket options which values have the int type.
-type SockOpt int
+// SockOptBool represents socket options which values have the bool type.
+type SockOptBool int
+
+const (
+ // ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS
+ // ancillary message is passed with incoming packets.
+ ReceiveTOSOption SockOptBool = iota
+
+ // V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6
+ // socket is to be restricted to sending and receiving IPv6 packets only.
+ V6OnlyOption
+)
+
+// SockOptInt represents socket options which values have the int type.
+type SockOptInt int
const (
// ReceiveQueueSizeOption is used in GetSockOptInt to specify that the
// number of unread bytes in the input buffer should be returned.
- ReceiveQueueSizeOption SockOpt = iota
+ ReceiveQueueSizeOption SockOptInt = iota
// SendBufferSizeOption is used by SetSockOptInt/GetSockOptInt to
// specify the send buffer size option.
@@ -509,10 +542,6 @@ const (
// the endpoint should be cleared and returned.
type ErrorOption struct{}
-// V6OnlyOption is used by SetSockOpt/GetSockOpt to specify whether an IPv6
-// socket is to be restricted to sending and receiving IPv6 packets only.
-type V6OnlyOption int
-
// CorkOption is used by SetSockOpt/GetSockOpt to specify if data should be
// held until segments are full by the TCP transport protocol.
type CorkOption int
@@ -527,7 +556,7 @@ type ReusePortOption int
// BindToDeviceOption is used by SetSockOpt/GetSockOpt to specify that sockets
// should bind only on a specific NIC.
-type BindToDeviceOption string
+type BindToDeviceOption NICID
// QuickAckOption is stubbed out in SetSockOpt/GetSockOpt.
type QuickAckOption int
@@ -564,6 +593,11 @@ type KeepaliveIntervalOption time.Duration
// closed.
type KeepaliveCountOption int
+// TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to specify a
+// user-specified timeout for a given TCP connection.
+// See RFC 5482 for details.
+type TCPUserTimeoutOption time.Duration
+
// CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
// the current congestion control algorithm.
type CongestionControlOption string
@@ -912,6 +946,14 @@ type TCPStats struct {
// ESTABLISHED state or the CLOSE-WAIT state.
EstablishedResets *StatCounter
+ // EstablishedClosed is the number of times established TCP connections
+ // made a transition to CLOSED state.
+ EstablishedClosed *StatCounter
+
+ // EstablishedTimedout is the number of times an established connection
+ // was reset because of a keep-alive timeout.
+ EstablishedTimedout *StatCounter
+
// ListenOverflowSynDrop is the number of times the listen queue overflowed
// and a SYN was dropped.
ListenOverflowSynDrop *StatCounter
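For callers, the V6OnlyOption change above means the old pattern of passing a standalone tcpip.V6OnlyOption value to SetSockOpt is replaced by the typed boolean accessors. A minimal sketch, assuming ep is any existing tcpip.Endpoint (the package and function names are illustrative):

package example

import "gvisor.dev/gvisor/pkg/tcpip"

// setAndCheckV6Only illustrates the new boolean accessors introduced above.
func setAndCheckV6Only(ep tcpip.Endpoint, v6only bool) *tcpip.Error {
	// Previously: var v tcpip.V6OnlyOption = 1; err := ep.SetSockOpt(v)
	if err := ep.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil {
		return err
	}
	got, err := ep.GetSockOptBool(tcpip.V6OnlyOption)
	if err != nil {
		return err
	}
	_ = got // got reports whether the endpoint is restricted to IPv6 only.
	return nil
}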
diff --git a/pkg/tcpip/timer.go b/pkg/tcpip/timer.go
new file mode 100644
index 000000000..f5f01f32f
--- /dev/null
+++ b/pkg/tcpip/timer.go
@@ -0,0 +1,161 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcpip
+
+import (
+ "sync"
+ "time"
+)
+
+// cancellableTimerInstance is a specific instance of CancellableTimer.
+//
+// Different instances are created each time CancellableTimer is Reset so each
+// timer has its own earlyReturn signal. This is to address a bug when a
+// CancellableTimer is stopped and reset in quick succession resulting in a
+// timer instance's earlyReturn signal being affected or seen by another timer
+// instance.
+//
+// Consider the following scenario where timer instances share a common
+// earlyReturn signal (T1 creates, stops and resets a CancellableTimer under a
+// lock L; T2, T3, T4 and T5 are goroutines that handle the first (A), second
+// (B), third (C), and fourth (D) instance of the timer firing, respectively):
+// T1: Obtain L
+// T1: Create a new CancellableTimer w/ lock L (create instance A)
+// T2: instance A fires, blocked trying to obtain L.
+// T1: Attempt to stop instance A (set earlyReturn = true)
+// T1: Reset timer (create instance B)
+// T3: instance B fires, blocked trying to obtain L.
+// T1: Attempt to stop instance B (set earlyReturn = true)
+// T1: Reset timer (create instance C)
+// T4: instance C fires, blocked trying to obtain L.
+// T1: Attempt to stop instance C (set earlyReturn = true)
+// T1: Reset timer (create instance D)
+// T5: instance D fires, blocked trying to obtain L.
+// T1: Release L
+//
+// Now that T1 has released L, any of the 4 timer instances can take L and check
+// earlyReturn. If the timers simply check earlyReturn and then do nothing
+// further, then instance D will early return (its fn will never run) even
+// though it was not requested to stop. If the timers reset earlyReturn before early returning,
+// then all but one of the timers will do work when only one was expected to.
+// If CancellableTimer resets earlyReturn when resetting, then all the timers
+// will fire (again, when only one was expected to).
+//
+// To address the above concerns the simplest solution was to give each timer
+// its own earlyReturn signal.
+type cancellableTimerInstance struct {
+ timer *time.Timer
+
+ // Used to inform the timer to early return when it gets stopped while the
+ // lock the timer tries to obtain when fired is held (T1 is a goroutine that
+ // tries to cancel the timer and T2 is the goroutine that handles the timer
+ // firing):
+ // T1: Obtain the lock, then call StopLocked()
+ // T2: timer fires, and gets blocked on obtaining the lock
+ // T1: Releases lock
+ // T2: Obtains lock, does unintended work
+ //
+ // To resolve this, T1 will check to see if the timer already fired, and
+ // inform the timer using earlyReturn to return early so that once T2 obtains
+ // the lock, it will see that it is set to true and do nothing further.
+ earlyReturn *bool
+}
+
+// stop stops the timer instance t from firing if it hasn't fired already. If it
+// has fired and is blocked at obtaining the lock, earlyReturn will be set to
+// true so that it will early return when it obtains the lock.
+func (t *cancellableTimerInstance) stop() {
+ if t.timer != nil {
+ t.timer.Stop()
+ *t.earlyReturn = true
+ }
+}
+
+// CancellableTimer is a timer that does some work and can be safely cancelled
+// when it fires at the same time some "related work" is being done.
+//
+// The term "related work" means work that must be done while holding the same
+// lock that the timer takes before running its function.
+type CancellableTimer struct {
+ // The active instance of a cancellable timer.
+ instance cancellableTimerInstance
+
+ // locker is the lock taken by the timer immediately after it fires and must
+ // be held when attempting to stop the timer.
+ //
+ // Must never change after being assigned.
+ locker sync.Locker
+
+ // fn is the function that will be called when a timer fires and has not been
+ // signaled to early return.
+ //
+ // fn MUST NOT attempt to lock locker.
+ //
+ // Must never change after being assigned.
+ fn func()
+}
+
+// StopLocked prevents the Timer from firing if it has not fired already.
+//
+// If the timer is blocked on obtaining the t.locker lock when StopLocked is
+// called, it will early return instead of calling t.fn.
+//
+// Note, t will be modified.
+//
+// t.locker MUST be locked.
+func (t *CancellableTimer) StopLocked() {
+ t.instance.stop()
+
+ // Nothing to do with the stopped instance anymore.
+ t.instance = cancellableTimerInstance{}
+}
+
+// Reset changes the timer to expire after duration d.
+//
+// Note, t will be modified.
+//
+// Reset should only be called on stopped or expired timers. To be safe, callers
+// should always call StopLocked before calling Reset.
+func (t *CancellableTimer) Reset(d time.Duration) {
+ // Create a new instance.
+ earlyReturn := false
+ t.instance = cancellableTimerInstance{
+ timer: time.AfterFunc(d, func() {
+ t.locker.Lock()
+ defer t.locker.Unlock()
+
+ if earlyReturn {
+ // If we reach this point, it means that the timer fired while another
+ // goroutine called StopLocked while it had the lock. Simply return
+ // here and do nothing further.
+ earlyReturn = false
+ return
+ }
+
+ t.fn()
+ }),
+ earlyReturn: &earlyReturn,
+ }
+}
+
+// MakeCancellableTimer returns an unscheduled CancellableTimer with the given
+// locker and fn.
+//
+// fn MUST NOT attempt to lock locker.
+//
+// Callers must call Reset to schedule the timer to fire.
+func MakeCancellableTimer(locker sync.Locker, fn func()) CancellableTimer {
+ return CancellableTimer{locker: locker, fn: fn}
+}
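A minimal usage sketch of the pattern this type is built for (the names below are illustrative): state touched by fn and state touched by other code paths are guarded by the same mutex, and StopLocked is only ever called with that mutex held:

package main

import (
	"sync"
	"time"

	"gvisor.dev/gvisor/pkg/tcpip"
)

// pending is hypothetical state whose fields are guarded by mu, which is also
// the lock handed to the CancellableTimer.
type pending struct {
	mu      sync.Mutex
	expired bool
	timer   tcpip.CancellableTimer
}

func newPending() *pending {
	p := &pending{}
	// fn runs with p.mu already held by the timer machinery, so it must not
	// attempt to lock p.mu itself.
	p.timer = tcpip.MakeCancellableTimer(&p.mu, func() {
		p.expired = true
	})
	return p
}

// start schedules the timer to fire after d.
func (p *pending) start(d time.Duration) {
	p.timer.Reset(d)
}

// cancel stops the timer; because p.mu is held, a concurrently firing
// instance will early return instead of setting expired.
func (p *pending) cancel() {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.timer.StopLocked()
}

func main() {
	p := newPending()
	p.start(10 * time.Millisecond)
	p.cancel()
}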
diff --git a/pkg/tcpip/timer_test.go b/pkg/tcpip/timer_test.go
new file mode 100644
index 000000000..2d20f7ef3
--- /dev/null
+++ b/pkg/tcpip/timer_test.go
@@ -0,0 +1,236 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcpip_test
+
+import (
+ "sync"
+ "testing"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+ shortDuration = 1 * time.Nanosecond
+ middleDuration = 100 * time.Millisecond
+ longDuration = 1 * time.Second
+)
+
+func TestCancellableTimerFire(t *testing.T) {
+ t.Parallel()
+
+ ch := make(chan struct{})
+ var lock sync.Mutex
+
+ timer := tcpip.MakeCancellableTimer(&lock, func() {
+ ch <- struct{}{}
+ })
+ timer.Reset(shortDuration)
+
+ // Wait for timer to fire.
+ select {
+ case <-ch:
+ case <-time.After(middleDuration):
+ t.Fatal("timed out waiting for timer to fire")
+ }
+
+ // The timer should have fired only once.
+ select {
+ case <-ch:
+ t.Fatal("no other timers should have fired")
+ case <-time.After(middleDuration):
+ }
+}
+
+func TestCancellableTimerResetFromLongDuration(t *testing.T) {
+ t.Parallel()
+
+ ch := make(chan struct{})
+ var lock sync.Mutex
+
+ timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+ timer.Reset(middleDuration)
+
+ lock.Lock()
+ timer.StopLocked()
+ lock.Unlock()
+
+ timer.Reset(shortDuration)
+
+ // Wait for timer to fire.
+ select {
+ case <-ch:
+ case <-time.After(middleDuration):
+ t.Fatal("timed out waiting for timer to fire")
+ }
+
+ // The timer should have fired only once.
+ select {
+ case <-ch:
+ t.Fatal("no other timers should have fired")
+ case <-time.After(middleDuration):
+ }
+}
+
+func TestCancellableTimerResetFromShortDuration(t *testing.T) {
+ t.Parallel()
+
+ ch := make(chan struct{})
+ var lock sync.Mutex
+
+ lock.Lock()
+ timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+ timer.Reset(shortDuration)
+ timer.StopLocked()
+ lock.Unlock()
+
+ // Wait for timer to fire if it wasn't correctly stopped.
+ select {
+ case <-ch:
+ t.Fatal("timer fired after being stopped")
+ case <-time.After(middleDuration):
+ }
+
+ timer.Reset(shortDuration)
+
+ // Wait for timer to fire.
+ select {
+ case <-ch:
+ case <-time.After(middleDuration):
+ t.Fatal("timed out waiting for timer to fire")
+ }
+
+ // The timer should have fired only once.
+ select {
+ case <-ch:
+ t.Fatal("no other timers should have fired")
+ case <-time.After(middleDuration):
+ }
+}
+
+func TestCancellableTimerImmediatelyStop(t *testing.T) {
+ t.Parallel()
+
+ ch := make(chan struct{})
+ var lock sync.Mutex
+
+ for i := 0; i < 1000; i++ {
+ lock.Lock()
+ timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+ timer.Reset(shortDuration)
+ timer.StopLocked()
+ lock.Unlock()
+ }
+
+ // Wait for timer to fire if it wasn't correctly stopped.
+ select {
+ case <-ch:
+ t.Fatal("timer fired after being stopped")
+ case <-time.After(middleDuration):
+ }
+}
+
+func TestCancellableTimerStoppedResetWithoutLock(t *testing.T) {
+ t.Parallel()
+
+ ch := make(chan struct{})
+ var lock sync.Mutex
+
+ lock.Lock()
+ timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+ timer.Reset(shortDuration)
+ timer.StopLocked()
+ lock.Unlock()
+
+ for i := 0; i < 10; i++ {
+ timer.Reset(middleDuration)
+
+ lock.Lock()
+ // Sleep until the timer fires and gets blocked trying to take the lock.
+ time.Sleep(middleDuration * 2)
+ timer.StopLocked()
+ lock.Unlock()
+ }
+
+ // Wait for double the duration so timers that weren't correctly stopped can
+ // fire.
+ select {
+ case <-ch:
+ t.Fatal("timer fired after being stopped")
+ case <-time.After(middleDuration * 2):
+ }
+}
+
+func TestManyCancellableTimerResetAfterBlockedOnLock(t *testing.T) {
+ t.Parallel()
+
+ ch := make(chan struct{})
+ var lock sync.Mutex
+
+ lock.Lock()
+ timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+ timer.Reset(shortDuration)
+ for i := 0; i < 10; i++ {
+ // Sleep until the timer fires and gets blocked trying to take the lock.
+ time.Sleep(middleDuration)
+ timer.StopLocked()
+ timer.Reset(shortDuration)
+ }
+ lock.Unlock()
+
+ // Wait for double the duration for the last timer to fire.
+ select {
+ case <-ch:
+ case <-time.After(middleDuration):
+ t.Fatal("timed out waiting for timer to fire")
+ }
+
+ // The timer should have fired only once.
+ select {
+ case <-ch:
+ t.Fatal("no other timers should have fired")
+ case <-time.After(middleDuration):
+ }
+}
+
+func TestManyCancellableTimerResetUnderLock(t *testing.T) {
+ t.Parallel()
+
+ ch := make(chan struct{})
+ var lock sync.Mutex
+
+ lock.Lock()
+ timer := tcpip.MakeCancellableTimer(&lock, func() { ch <- struct{}{} })
+ timer.Reset(shortDuration)
+ for i := 0; i < 10; i++ {
+ timer.StopLocked()
+ timer.Reset(shortDuration)
+ }
+ lock.Unlock()
+
+ // Wait for double the duration for the last timer to fire.
+ select {
+ case <-ch:
+ case <-time.After(middleDuration):
+ t.Fatal("timed out waiting for timer to fire")
+ }
+
+ // The timer should have fired only once.
+ select {
+ case <-ch:
+ t.Fatal("no other timers should have fired")
+ case <-time.After(middleDuration):
+ }
+}
diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD
index d8c5b5058..3aa23d529 100644
--- a/pkg/tcpip/transport/icmp/BUILD
+++ b/pkg/tcpip/transport/icmp/BUILD
@@ -28,6 +28,7 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//pkg/sleep",
+ "//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 9c40931b5..42afb3f5b 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -15,8 +15,7 @@
package icmp
import (
- "sync"
-
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -289,7 +288,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
toCopy := *to
to = &toCopy
- netProto, err := e.checkV4Mapped(to, true)
+ netProto, err := e.checkV4Mapped(to)
if err != nil {
return 0, nil, err
}
@@ -350,13 +349,23 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
return nil
}
+// SetSockOptBool sets a socket option. Currently not supported.
+func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+ return nil
+}
+
// SetSockOptInt sets a socket option. Currently not supported.
-func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
return nil
}
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+ return false, tcpip.ErrUnknownProtocolOption
+}
+
// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
switch opt {
case tcpip.ReceiveQueueSizeOption:
v := 0
@@ -466,18 +475,12 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err
})
}
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
- netProto := e.NetProto
- if header.IsV4MappedAddress(addr.Addr) {
- return 0, tcpip.ErrNoRoute
- }
-
- // Fail if we're bound to an address length different from the one we're
- // checking.
- if l := len(e.ID.LocalAddress); !allowMismatch && l != 0 && l != len(addr.Addr) {
- return 0, tcpip.ErrInvalidEndpointState
+func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
+ unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, false /* v6only */)
+ if err != nil {
+ return 0, err
}
-
+ *addr = unwrapped
return netProto, nil
}
@@ -509,7 +512,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
return tcpip.ErrInvalidEndpointState
}
- netProto, err := e.checkV4Mapped(&addr, false)
+ netProto, err := e.checkV4Mapped(&addr)
if err != nil {
return err
}
@@ -622,7 +625,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
return tcpip.ErrInvalidEndpointState
}
- netProto, err := e.checkV4Mapped(&addr, false)
+ netProto, err := e.checkV4Mapped(&addr)
if err != nil {
return err
}
diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD
index 44b58ff6b..4858d150c 100644
--- a/pkg/tcpip/transport/packet/BUILD
+++ b/pkg/tcpip/transport/packet/BUILD
@@ -28,6 +28,7 @@ go_library(
deps = [
"//pkg/log",
"//pkg/sleep",
+ "//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 0010b5e5f..fc5bc69fa 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -25,8 +25,7 @@
package packet
import (
- "sync"
-
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -247,17 +246,17 @@ func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
// used with SetSockOpt, and this function always returns
// tcpip.ErrNotSupported.
func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
- return tcpip.ErrNotSupported
+ return tcpip.ErrUnknownProtocolOption
}
-// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
-func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
+func (ep *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
return tcpip.ErrUnknownProtocolOption
}
-// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (ep *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
- return 0, tcpip.ErrNotSupported
+// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
+func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+ return tcpip.ErrUnknownProtocolOption
}
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
@@ -265,6 +264,16 @@ func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
return tcpip.ErrNotSupported
}
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (ep *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+ return false, tcpip.ErrNotSupported
+}
+
+// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
+func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
+ return 0, tcpip.ErrNotSupported
+}
+
// HandlePacket implements stack.PacketEndpoint.HandlePacket.
func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt tcpip.PacketBuffer) {
ep.rcvMu.Lock()
diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD
index 00991ac8e..2f2131ff7 100644
--- a/pkg/tcpip/transport/raw/BUILD
+++ b/pkg/tcpip/transport/raw/BUILD
@@ -29,6 +29,7 @@ go_library(
deps = [
"//pkg/log",
"//pkg/sleep",
+ "//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 5aafe2615..ee9c4c58b 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -26,8 +26,7 @@
package raw
import (
- "sync"
-
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -509,13 +508,38 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
return tcpip.ErrUnknownProtocolOption
}
+// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
+func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+ return tcpip.ErrUnknownProtocolOption
+}
+
// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
-func (ep *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
return tcpip.ErrUnknownProtocolOption
}
+// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
+func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+ switch o := opt.(type) {
+ case tcpip.ErrorOption:
+ return nil
+
+ case *tcpip.KeepaliveEnabledOption:
+ *o = 0
+ return nil
+
+ default:
+ return tcpip.ErrUnknownProtocolOption
+ }
+}
+
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+ return false, tcpip.ErrUnknownProtocolOption
+}
+
// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
switch opt {
case tcpip.ReceiveQueueSizeOption:
v := 0
@@ -544,21 +568,6 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
return -1, tcpip.ErrUnknownProtocolOption
}
-// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
-func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
- switch o := opt.(type) {
- case tcpip.ErrorOption:
- return nil
-
- case *tcpip.KeepaliveEnabledOption:
- *o = 0
- return nil
-
- default:
- return tcpip.ErrUnknownProtocolOption
- }
-}
-
// HandlePacket implements stack.RawTransportEndpoint.HandlePacket.
func (e *endpoint) HandlePacket(route *stack.Route, pkt tcpip.PacketBuffer) {
e.rcvMu.Lock()
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index dd1728f9c..0e3ab05ad 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -16,6 +16,18 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "tcp_endpoint_list",
+ out = "tcp_endpoint_list.go",
+ package = "tcp",
+ prefix = "endpoint",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*endpoint",
+ "Linker": "*endpoint",
+ },
+)
+
go_library(
name = "tcp",
srcs = [
@@ -23,11 +35,13 @@ go_library(
"connect.go",
"cubic.go",
"cubic_state.go",
+ "dispatcher.go",
"endpoint.go",
"endpoint_state.go",
"forwarder.go",
"protocol.go",
"rcv.go",
+ "rcv_state.go",
"reno.go",
"sack.go",
"sack_scoreboard.go",
@@ -37,6 +51,7 @@ go_library(
"segment_state.go",
"snd.go",
"snd_state.go",
+ "tcp_endpoint_list.go",
"tcp_segment_list.go",
"timer.go",
],
@@ -44,14 +59,15 @@ go_library(
imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
visibility = ["//visibility:public"],
deps = [
- "//pkg/log",
"//pkg/rand",
"//pkg/sleep",
+ "//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/hash/jenkins",
"//pkg/tcpip/header",
"//pkg/tcpip/iptables",
+ "//pkg/tcpip/ports",
"//pkg/tcpip/seqnum",
"//pkg/tcpip/stack",
"//pkg/tcpip/transport/raw",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index f543a6105..1a2e3efa9 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -19,11 +19,11 @@ import (
"encoding/binary"
"hash"
"io"
- "sync"
"time"
"gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/sleep"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -242,6 +242,13 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
n.initGSO()
+ // Inherit any socket options from the listening endpoint that should
+ // be propagated to the new endpoint.
+ // listenEP is nil when used by tcp.Forwarder, hence this check.
+ if l.listenEP != nil {
+ l.listenEP.propagateInheritableOptions(n)
+ }
+
// Register new endpoint so that packets are routed to it.
if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil {
n.Close()
@@ -278,7 +285,7 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
// listenEP is nil when listenContext is used by tcp.Forwarder.
if l.listenEP != nil {
l.listenEP.mu.Lock()
- if l.listenEP.state != StateListen {
+ if l.listenEP.EndpointState() != StateListen {
l.listenEP.mu.Unlock()
return nil, tcpip.ErrConnectionAborted
}
@@ -298,8 +305,6 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
return nil, err
}
ep.mu.Lock()
- ep.stack.Stats().TCP.CurrentEstablished.Increment()
- ep.state = StateEstablished
ep.isConnectNotified = true
ep.mu.Unlock()
@@ -339,11 +344,12 @@ func (l *listenContext) closeAllPendingEndpoints() {
// instead.
func (e *endpoint) deliverAccepted(n *endpoint) {
e.mu.Lock()
- state := e.state
+ state := e.EndpointState()
e.pendingAccepted.Add(1)
defer e.pendingAccepted.Done()
acceptedChan := e.acceptedChan
e.mu.Unlock()
+
if state == StateListen {
acceptedChan <- n
e.waiterQueue.Notify(waiter.EventIn)
@@ -352,6 +358,14 @@ func (e *endpoint) deliverAccepted(n *endpoint) {
}
}
+// propagateInheritableOptions propagates any options set on the listening
+// endpoint to the newly created endpoint.
+func (e *endpoint) propagateInheritableOptions(n *endpoint) {
+ e.mu.Lock()
+ n.userTimeout = e.userTimeout
+ e.mu.Unlock()
+}
+
// handleSynSegment is called in its own goroutine once the listening endpoint
// receives a SYN segment. It is responsible for completing the handshake and
// queueing the new endpoint for acceptance.
@@ -546,9 +560,11 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
n.tsOffset = 0
// Switch state to connected.
+ // We do not use transitionToStateEstablishedLocked here as there is
+ // no handshake state available when doing a SYN cookie based accept.
n.stack.Stats().TCP.CurrentEstablished.Increment()
- n.state = StateEstablished
n.isConnectNotified = true
+ n.setEndpointState(StateEstablished)
// Do the delivery in a separate goroutine so
// that we don't block the listen loop in case
@@ -581,7 +597,7 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
// handleSynSegment() from attempting to queue new connections
// to the endpoint.
e.mu.Lock()
- e.state = StateClose
+ e.setEndpointState(StateClose)
// close any endpoints in SYN-RCVD state.
ctx.closeAllPendingEndpoints()
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 4206db8b6..a2f384384 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -16,11 +16,11 @@ package tcp
import (
"encoding/binary"
- "sync"
"time"
"gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/sleep"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
@@ -190,7 +190,7 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
h.mss = opts.MSS
h.sndWndScale = opts.WS
h.ep.mu.Lock()
- h.ep.state = StateSynRecv
+ h.ep.setEndpointState(StateSynRecv)
h.ep.mu.Unlock()
}
@@ -218,6 +218,14 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
// acceptable if the ack field acknowledges the SYN.
if s.flagIsSet(header.TCPFlagRst) {
if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
+ // RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
+ // was acceptable then signal the user "error: connection reset", drop
+ // the segment, enter CLOSED state, delete TCB, and return."
+ h.ep.mu.Lock()
+ h.ep.workerCleanup = true
+ h.ep.mu.Unlock()
+ // Although the RFC above calls out ECONNRESET, Linux actually returns
+ // ECONNREFUSED here so we do as well.
return tcpip.ErrConnectionRefused
}
return nil
@@ -252,6 +260,11 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
// and the handshake is completed.
if s.flagIsSet(header.TCPFlagAck) {
h.state = handshakeCompleted
+
+ h.ep.mu.Lock()
+ h.ep.transitionToStateEstablishedLocked(h)
+ h.ep.mu.Unlock()
+
h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
return nil
}
@@ -261,14 +274,14 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
// SYN-RCVD state.
h.state = handshakeSynRcvd
h.ep.mu.Lock()
- h.ep.state = StateSynRecv
ttl := h.ep.ttl
+ h.ep.setEndpointState(StateSynRecv)
h.ep.mu.Unlock()
synOpts := header.TCPSynOptions{
WS: int(h.effectiveRcvWndScale()),
TS: rcvSynOpts.TS,
TSVal: h.ep.timestamp(),
- TSEcr: h.ep.recentTS,
+ TSEcr: h.ep.recentTimestamp(),
// We only send SACKPermitted if the other side indicated it
// permits SACK. This is not explicitly defined in the RFC but
@@ -328,7 +341,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
WS: h.rcvWndScale,
TS: h.ep.sendTSOk,
TSVal: h.ep.timestamp(),
- TSEcr: h.ep.recentTS,
+ TSEcr: h.ep.recentTimestamp(),
SACKPermitted: h.ep.sackPermitted,
MSS: h.ep.amss,
}
@@ -352,6 +365,10 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
}
h.state = handshakeCompleted
+ h.ep.mu.Lock()
+ h.ep.transitionToStateEstablishedLocked(h)
+ h.ep.mu.Unlock()
+
return nil
}
@@ -484,7 +501,7 @@ func (h *handshake) execute() *tcpip.Error {
WS: h.rcvWndScale,
TS: true,
TSVal: h.ep.timestamp(),
- TSEcr: h.ep.recentTS,
+ TSEcr: h.ep.recentTimestamp(),
SACKPermitted: bool(sackEnabled),
MSS: h.ep.amss,
}
@@ -775,7 +792,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
// Ref: https://tools.ietf.org/html/rfc7323#section-5.4.
offset += header.EncodeNOP(options[offset:])
offset += header.EncodeNOP(options[offset:])
- offset += header.EncodeTSOption(e.timestamp(), uint32(e.recentTS), options[offset:])
+ offset += header.EncodeTSOption(e.timestamp(), e.recentTimestamp(), options[offset:])
}
if e.sackPermitted && len(sackBlocks) > 0 {
offset += header.EncodeNOP(options[offset:])
@@ -794,7 +811,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
// sendRaw sends a TCP segment to the endpoint's peer.
func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error {
var sackBlocks []header.SACKBlock
- if e.state == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
+ if e.EndpointState() == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
}
options := e.makeOptions(sackBlocks)
@@ -831,6 +848,9 @@ func (e *endpoint) handleWrite() *tcpip.Error {
}
func (e *endpoint) handleClose() *tcpip.Error {
+ if !e.EndpointState().connected() {
+ return nil
+ }
// Drain the send queue.
e.handleWrite()
@@ -847,13 +867,9 @@ func (e *endpoint) handleClose() *tcpip.Error {
func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
// Only send a reset if the connection is being aborted for a reason
// other than receiving a reset.
- if e.state == StateEstablished || e.state == StateCloseWait {
- e.stack.Stats().TCP.EstablishedResets.Increment()
- e.stack.Stats().TCP.CurrentEstablished.Decrement()
- }
- e.state = StateError
+ e.setEndpointState(StateError)
e.HardError = err
- if err != tcpip.ErrConnectionReset {
+ if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout {
// The exact sequence number to be used for the RST is the same as the
// one used by Linux. We need to handle the case of window being shrunk
// which can cause sndNxt to be outside the acceptable window on the
@@ -871,26 +887,54 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
}
// completeWorkerLocked is called by the worker goroutine when it's about to
-// exit. It marks the worker as completed and performs cleanup work if requested
-// by Close().
+// exit.
func (e *endpoint) completeWorkerLocked() {
+ // Worker is terminating(either due to moving to
+ // CLOSED or ERROR state, ensure we release all
+ // registrations port reservations even if the socket
+ // itself is not yet closed by the application.
e.workerRunning = false
if e.workerCleanup {
e.cleanupLocked()
}
}
+// transitionToStateEstablishedLocked transitions a given endpoint
+// to the ESTABLISHED state using the handshake parameters provided.
+// It also initializes sender/receiver if required.
+func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
+ if e.snd == nil {
+ // Transfer handshake state to TCP connection. We disable
+ // receive window scaling if the peer doesn't support it
+ // (indicated by a negative send window scale).
+ e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)
+ }
+ if e.rcv == nil {
+ rcvBufSize := seqnum.Size(e.receiveBufferSize())
+ e.rcvListMu.Lock()
+ e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
+ // Bootstrap the auto tuning algorithm. Starting at zero will
+ // result in a really large receive window after the first auto
+ // tuning adjustment.
+ e.rcvAutoParams.prevCopied = int(h.rcvWnd)
+ e.rcvListMu.Unlock()
+ }
+ e.setEndpointState(StateEstablished)
+}
+
// transitionToStateCloseLocked ensures that the endpoint is
// cleaned up from the transport demuxer, "before" moving to
// StateClose. This will ensure that no packet will be
// delivered to this endpoint from the demuxer when the endpoint
// is transitioned to StateClose.
func (e *endpoint) transitionToStateCloseLocked() {
- if e.state == StateClose {
+ if e.EndpointState() == StateClose {
return
}
+ // Mark the endpoint as fully closed for reads/writes.
e.cleanupLocked()
- e.state = StateClose
+ e.setEndpointState(StateClose)
+ e.stack.Stats().TCP.EstablishedClosed.Increment()
}
// tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed
@@ -904,7 +948,9 @@ func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
s.decRef()
return
}
- ep.(*endpoint).enqueueSegment(s)
+ if ep.(*endpoint).enqueueSegment(s) {
+ ep.(*endpoint).newSegmentWaker.Assert()
+ }
}
func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
@@ -913,9 +959,8 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
// except SYN-SENT, all reset (RST) segments are
// validated by checking their SEQ-fields." So
// we only process it if it's acceptable.
- s.decRef()
e.mu.Lock()
- switch e.state {
+ switch e.EndpointState() {
// In case of a RST in CLOSE-WAIT linux moves
// the socket to closed state with an error set
// to indicate EPIPE.
@@ -939,99 +984,53 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
e.transitionToStateCloseLocked()
e.HardError = tcpip.ErrAborted
e.mu.Unlock()
+ e.notifyProtocolGoroutine(notifyTickleWorker)
return false, nil
default:
e.mu.Unlock()
+ // RFC 793, page 37 states that "in all states
+ // except SYN-SENT, all reset (RST) segments are
+ // validated by checking their SEQ-fields." So
+ // we only process it if it's acceptable.
+
+ // Notify protocol goroutine. This is required when
+ // handleSegment is invoked from the processor goroutine
+ // rather than the worker goroutine.
+ e.notifyProtocolGoroutine(notifyResetByPeer)
return false, tcpip.ErrConnectionReset
}
}
return true, nil
}
-// handleSegments pulls segments from the queue and processes them. It returns
-// no error if the protocol loop should continue, an error otherwise.
-func (e *endpoint) handleSegments() *tcpip.Error {
+// handleSegments processes all inbound segments.
+func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error {
checkRequeue := true
for i := 0; i < maxSegmentsPerWake; i++ {
- e.mu.RLock()
- state := e.state
- e.mu.RUnlock()
- if state == StateClose {
- // When we get into StateClose while processing from the queue,
- // return immediately and let the protocolMainloop handle it.
- //
- // We can reach StateClose only while processing a previous segment
- // or a notification from the protocolMainLoop (caller goroutine).
- // This means that with this return, the segment dequeue below can
- // never occur on a closed endpoint.
+ if e.EndpointState() == StateClose || e.EndpointState() == StateError {
return nil
}
-
s := e.segmentQueue.dequeue()
if s == nil {
checkRequeue = false
break
}
- // Invoke the tcp probe if installed.
- if e.probe != nil {
- e.probe(e.completeState())
+ cont, err := e.handleSegment(s)
+ if err != nil {
+ s.decRef()
+ return err
}
-
- if s.flagIsSet(header.TCPFlagRst) {
- if ok, err := e.handleReset(s); !ok {
- return err
- }
- } else if s.flagIsSet(header.TCPFlagSyn) {
- // See: https://tools.ietf.org/html/rfc5961#section-4.1
- // 1) If the SYN bit is set, irrespective of the sequence number, TCP
- // MUST send an ACK (also referred to as challenge ACK) to the remote
- // peer:
- //
- // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
- //
- // After sending the acknowledgment, TCP MUST drop the unacceptable
- // segment and stop processing further.
- //
- // By sending an ACK, the remote peer is challenged to confirm the loss
- // of the previous connection and the request to start a new connection.
- // A legitimate peer, after restart, would not have a TCB in the
- // synchronized state. Thus, when the ACK arrives, the peer should send
- // a RST segment back with the sequence number derived from the ACK
- // field that caused the RST.
-
- // This RST will confirm that the remote peer has indeed closed the
- // previous connection. Upon receipt of a valid RST, the local TCP
- // endpoint MUST terminate its connection. The local TCP endpoint
- // should then rely on SYN retransmission from the remote end to
- // re-establish the connection.
-
- e.snd.sendAck()
- } else if s.flagIsSet(header.TCPFlagAck) {
- // Patch the window size in the segment according to the
- // send window scale.
- s.window <<= e.snd.sndWndScale
-
- // RFC 793, page 41 states that "once in the ESTABLISHED
- // state all segments must carry current acknowledgment
- // information."
- drop, err := e.rcv.handleRcvdSegment(s)
- if err != nil {
- s.decRef()
- return err
- }
- if drop {
- s.decRef()
- continue
- }
- e.snd.handleRcvdSegment(s)
+ if !cont {
+ s.decRef()
+ return nil
}
- s.decRef()
}
- // If the queue is not empty, make sure we'll wake up in the next
- // iteration.
- if checkRequeue && !e.segmentQueue.empty() {
+ // When fastPath is true we don't want to wake up the worker
+ // goroutine. If the endpoint has more segments to process the
+ // dispatcher will call handleSegments again anyway.
+ if !fastPath && checkRequeue && !e.segmentQueue.empty() {
e.newSegmentWaker.Assert()
}
@@ -1040,23 +1039,113 @@ func (e *endpoint) handleSegments() *tcpip.Error {
e.snd.sendAck()
}
- e.resetKeepaliveTimer(true)
+ e.resetKeepaliveTimer(true /* receivedData */)
return nil
}
+// handleSegment handles a given segment and notifies the worker goroutine
+// if the connection should be terminated.
+func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
+ // Invoke the tcp probe if installed.
+ if e.probe != nil {
+ e.probe(e.completeState())
+ }
+
+ if s.flagIsSet(header.TCPFlagRst) {
+ if ok, err := e.handleReset(s); !ok {
+ return false, err
+ }
+ } else if s.flagIsSet(header.TCPFlagSyn) {
+ // See: https://tools.ietf.org/html/rfc5961#section-4.1
+ // 1) If the SYN bit is set, irrespective of the sequence number, TCP
+ // MUST send an ACK (also referred to as challenge ACK) to the remote
+ // peer:
+ //
+ // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+ //
+ // After sending the acknowledgment, TCP MUST drop the unacceptable
+ // segment and stop processing further.
+ //
+ // By sending an ACK, the remote peer is challenged to confirm the loss
+ // of the previous connection and the request to start a new connection.
+ // A legitimate peer, after restart, would not have a TCB in the
+ // synchronized state. Thus, when the ACK arrives, the peer should send
+ // a RST segment back with the sequence number derived from the ACK
+ // field that caused the RST.
+
+ // This RST will confirm that the remote peer has indeed closed the
+ // previous connection. Upon receipt of a valid RST, the local TCP
+ // endpoint MUST terminate its connection. The local TCP endpoint
+ // should then rely on SYN retransmission from the remote end to
+ // re-establish the connection.
+
+ e.snd.sendAck()
+ } else if s.flagIsSet(header.TCPFlagAck) {
+ // Patch the window size in the segment according to the
+ // send window scale.
+ s.window <<= e.snd.sndWndScale
+
+ // RFC 793, page 41 states that "once in the ESTABLISHED
+ // state all segments must carry current acknowledgment
+ // information."
+ drop, err := e.rcv.handleRcvdSegment(s)
+ if err != nil {
+ return false, err
+ }
+ if drop {
+ return true, nil
+ }
+
+ // Now check if the received segment has caused us to transition
+ // to a CLOSED state, if yes then terminate processing and do
+ // not invoke the sender.
+ e.mu.RLock()
+ state := e.state
+ e.mu.RUnlock()
+ if state == StateClose {
+ // When we get into StateClose while processing from the queue,
+ // return immediately and let the protocolMainloop handle it.
+ //
+ // We can reach StateClose only while processing a previous segment
+ // or a notification from the protocolMainLoop (caller goroutine).
+ // This means that with this return, the segment dequeue below can
+ // never occur on a closed endpoint.
+ s.decRef()
+ return false, nil
+ }
+
+ e.snd.handleRcvdSegment(s)
+ }
+
+ return true, nil
+}
+
// keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP
// keepalive packets periodically when the connection is idle. If we don't hear
// from the other side after a number of tries, we terminate the connection.
func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
+ e.mu.RLock()
+ userTimeout := e.userTimeout
+ e.mu.RUnlock()
+
e.keepalive.Lock()
if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
e.keepalive.Unlock()
return nil
}
+ // If a userTimeout is set then abort the connection if it is
+ // exceeded.
+ if userTimeout != 0 && time.Since(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 {
+ e.keepalive.Unlock()
+ e.stack.Stats().TCP.EstablishedTimedout.Increment()
+ return tcpip.ErrTimeout
+ }
+
if e.keepalive.unacked >= e.keepalive.count {
e.keepalive.Unlock()
+ e.stack.Stats().TCP.EstablishedTimedout.Increment()
return tcpip.ErrTimeout
}
@@ -1073,7 +1162,6 @@ func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
// whether it is enabled for this endpoint.
func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
e.keepalive.Lock()
- defer e.keepalive.Unlock()
if receivedData {
e.keepalive.unacked = 0
}
@@ -1081,6 +1169,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
// data to send.
if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt {
e.keepalive.timer.disable()
+ e.keepalive.Unlock()
return
}
if e.keepalive.unacked > 0 {
@@ -1088,6 +1177,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
} else {
e.keepalive.timer.enable(e.keepalive.idle)
}
+ e.keepalive.Unlock()
}
// disableKeepaliveTimer stops the keepalive timer.
@@ -1100,7 +1190,7 @@ func (e *endpoint) disableKeepaliveTimer() {
// protocolMainLoop is the main loop of the TCP protocol. It runs in its own
// goroutine and is responsible for sending segments and handling received
// segments.
-func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
+func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) *tcpip.Error {
var closeTimer *time.Timer
var closeWaker sleep.Waker
@@ -1122,6 +1212,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
}
e.mu.Unlock()
+ e.workMu.Unlock()
// When the protocol loop exits we should wake up our waiters.
e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
}
@@ -1133,7 +1224,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
initialRcvWnd := e.initialReceiveWindow()
h := newHandshake(e, seqnum.Size(initialRcvWnd))
e.mu.Lock()
- h.ep.state = StateSynSent
+ h.ep.setEndpointState(StateSynSent)
e.mu.Unlock()
if err := h.execute(); err != nil {
@@ -1142,41 +1233,18 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
e.lastErrorMu.Unlock()
e.mu.Lock()
- e.stack.Stats().TCP.EstablishedResets.Increment()
- e.stack.Stats().TCP.CurrentEstablished.Decrement()
- e.state = StateError
+ e.setEndpointState(StateError)
e.HardError = err
// Lock released below.
epilogue()
-
return err
}
-
- // Transfer handshake state to TCP connection. We disable
- // receive window scaling if the peer doesn't support it
- // (indicated by a negative send window scale).
- e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale)
-
- rcvBufSize := seqnum.Size(e.receiveBufferSize())
- e.rcvListMu.Lock()
- e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
- // boot strap the auto tuning algorithm. Starting at zero will
- // result in a large step function on the first proper causing
- // the window to just go to a really large value after the first
- // RTT itself.
- e.rcvAutoParams.prevCopied = initialRcvWnd
- e.rcvListMu.Unlock()
- e.stack.Stats().TCP.CurrentEstablished.Increment()
- e.mu.Lock()
- e.state = StateEstablished
- e.mu.Unlock()
}
e.keepalive.timer.init(&e.keepalive.waker)
defer e.keepalive.timer.cleanup()
- // Tell waiters that the endpoint is connected and writable.
e.mu.Lock()
drained := e.drainDone != nil
e.mu.Unlock()
@@ -1185,8 +1253,6 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
<-e.undrain
}
- e.waiterQueue.Notify(waiter.EventOut)
-
// Set up the functions that will be called when the main protocol loop
// wakes up.
funcs := []struct {
@@ -1202,17 +1268,14 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
f: e.handleClose,
},
{
- w: &e.newSegmentWaker,
- f: e.handleSegments,
- },
- {
w: &closeWaker,
f: func() *tcpip.Error {
// This means the socket is being closed due
- // to the TCP_FIN_WAIT2 timeout was hit. Just
+ // to the TCP-FIN-WAIT2 timeout being hit. Just
// mark the socket as closed.
e.mu.Lock()
e.transitionToStateCloseLocked()
+ e.workerCleanup = true
e.mu.Unlock()
return nil
},
@@ -1221,12 +1284,19 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
w: &e.snd.resendWaker,
f: func() *tcpip.Error {
if !e.snd.retransmitTimerExpired() {
+ e.stack.Stats().TCP.EstablishedTimedout.Increment()
return tcpip.ErrTimeout
}
return nil
},
},
{
+ w: &e.newSegmentWaker,
+ f: func() *tcpip.Error {
+ return e.handleSegments(false /* fastPath */)
+ },
+ },
+ {
w: &e.keepalive.waker,
f: e.keepaliveTimerExpired,
},
@@ -1253,14 +1323,16 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
}
if n&notifyReset != 0 {
- e.mu.Lock()
- e.resetConnectionLocked(tcpip.ErrConnectionAborted)
- e.mu.Unlock()
+ return tcpip.ErrConnectionAborted
+ }
+
+ if n&notifyResetByPeer != 0 {
+ return tcpip.ErrConnectionReset
}
if n&notifyClose != 0 && closeTimer == nil {
e.mu.Lock()
- if e.state == StateFinWait2 && e.closed {
+ if e.EndpointState() == StateFinWait2 && e.closed {
// The socket has been closed and we are in FIN_WAIT2
// so start the FIN_WAIT2 timer.
closeTimer = time.AfterFunc(e.tcpLingerTimeout, func() {
@@ -1280,11 +1352,11 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
if n&notifyDrain != 0 {
for !e.segmentQueue.empty() {
- if err := e.handleSegments(); err != nil {
+ if err := e.handleSegments(false /* fastPath */); err != nil {
return err
}
}
- if e.state != StateClose && e.state != StateError {
+ if e.EndpointState() != StateClose && e.EndpointState() != StateError {
// Only block the worker if the endpoint
// is not in closed state or error state.
close(e.drainDone)
@@ -1309,14 +1381,21 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
s.AddWaker(funcs[i].w, i)
}
+ // Notify the caller that the waker initialization is complete and the
+ // endpoint is ready.
+ if wakerInitDone != nil {
+ close(wakerInitDone)
+ }
+
+ // Tell waiters that the endpoint is connected and writable.
+ e.waiterQueue.Notify(waiter.EventOut)
+
// The following assertions and notifications are needed for restored
// endpoints. Fresh newly created endpoints have empty states and should
// not invoke any.
- e.segmentQueue.mu.Lock()
- if !e.segmentQueue.list.Empty() {
+ if !e.segmentQueue.empty() {
e.newSegmentWaker.Assert()
}
- e.segmentQueue.mu.Unlock()
e.rcvListMu.Lock()
if !e.rcvList.Empty() {
@@ -1331,28 +1410,53 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
// Main loop. Handle segments until both send and receive ends of the
// connection have completed.
+ cleanupOnError := func(err *tcpip.Error) {
+ e.mu.Lock()
+ e.workerCleanup = true
+ if err != nil {
+ e.resetConnectionLocked(err)
+ }
+ // Lock released below.
+ epilogue()
+ }
- for e.state != StateTimeWait && e.state != StateClose && e.state != StateError {
+loop:
+ for e.EndpointState() != StateTimeWait && e.EndpointState() != StateClose && e.EndpointState() != StateError {
e.mu.Unlock()
e.workMu.Unlock()
v, _ := s.Fetch(true)
e.workMu.Lock()
- if err := funcs[v].f(); err != nil {
- e.mu.Lock()
- // Ensure we release all endpoint registration and route
- // references as the connection is now in an error
- // state.
- e.workerCleanup = true
- e.resetConnectionLocked(err)
- // Lock released below.
- epilogue()
+ // We need to double check here because the notification may be
+ // stale by the time we get around to processing it.
+ //
+ // NOTE: since we now hold the workMu the processors cannot
+ // change the state of the endpoint so it's safe to proceed
+ // after this check.
+ switch e.EndpointState() {
+ case StateError:
+ // If the endpoint has already transitioned to an ERROR
+ // state just pass nil here as any reset that may need
+ // to be sent, etc., should already have been done and we
+ // just want to terminate the loop and clean up the
+ // endpoint.
+ cleanupOnError(nil)
return nil
+ case StateTimeWait:
+ fallthrough
+ case StateClose:
+ e.mu.Lock()
+ break loop
+ default:
+ if err := funcs[v].f(); err != nil {
+ cleanupOnError(err)
+ return nil
+ }
+ e.mu.Lock()
}
- e.mu.Lock()
}
- state := e.state
+ state := e.EndpointState()
e.mu.Unlock()
var reuseTW func()
if state == StateTimeWait {
@@ -1365,14 +1469,15 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
s.Done()
// Wake up any waiters before we enter TIME_WAIT.
e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+ e.mu.Lock()
+ e.workerCleanup = true
+ e.mu.Unlock()
reuseTW = e.doTimeWait()
}
// Mark endpoint as closed.
e.mu.Lock()
- if e.state != StateError {
- e.stack.Stats().TCP.EstablishedResets.Increment()
- e.stack.Stats().TCP.CurrentEstablished.Decrement()
+ if e.EndpointState() != StateError {
e.transitionToStateCloseLocked()
}
@@ -1388,6 +1493,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
if s == nil {
break
}
+
e.tryDeliverSegmentFromClosedEndpoint(s)
}
@@ -1428,7 +1534,11 @@ func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()
tcpEP := listenEP.(*endpoint)
if EndpointState(tcpEP.State()) == StateListen {
reuseTW = func() {
- tcpEP.enqueueSegment(s)
+ if !tcpEP.enqueueSegment(s) {
+ s.decRef()
+ return
+ }
+ tcpEP.newSegmentWaker.Assert()
}
// We explicitly do not decRef
// the segment as it's still
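The keepalive changes above fold TCP_USER_TIMEOUT handling into the keepalive path: when the keepalive timer fires, the connection is aborted if a user timeout is configured, keepalive probes are outstanding, and nothing has been ACKed within the timeout window. A minimal, self-contained sketch of that predicate (hypothetical helper names, not part of this change):

package main

import (
	"fmt"
	"time"
)

// shouldAbortOnUserTimeout mirrors the check added to keepaliveTimerExpired:
// abort only when a user timeout is set, the last ACK is older than the
// timeout, and unacked keepalive probes are outstanding.
func shouldAbortOnUserTimeout(userTimeout time.Duration, lastRcvdAck time.Time, unackedProbes int) bool {
	return userTimeout != 0 &&
		time.Since(lastRcvdAck) >= userTimeout &&
		unackedProbes > 0
}

func main() {
	lastAck := time.Now().Add(-30 * time.Second)
	fmt.Println(shouldAbortOnUserTimeout(10*time.Second, lastAck, 2)) // true: abort with a timeout error
	fmt.Println(shouldAbortOnUserTimeout(0, lastAck, 2))              // false: no user timeout configured
}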
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
new file mode 100644
index 000000000..e18012ac0
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -0,0 +1,224 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+ "gvisor.dev/gvisor/pkg/rand"
+ "gvisor.dev/gvisor/pkg/sleep"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// epQueue is a queue of endpoints.
+type epQueue struct {
+ mu sync.Mutex
+ list endpointList
+}
+
+// enqueue adds e to the queue if the endpoint is not already on the queue.
+func (q *epQueue) enqueue(e *endpoint) {
+ q.mu.Lock()
+ if e.pendingProcessing {
+ q.mu.Unlock()
+ return
+ }
+ q.list.PushBack(e)
+ e.pendingProcessing = true
+ q.mu.Unlock()
+}
+
+// dequeue removes and returns the first element from the queue if available,
+// returns nil otherwise.
+func (q *epQueue) dequeue() *endpoint {
+ q.mu.Lock()
+ if e := q.list.Front(); e != nil {
+ q.list.Remove(e)
+ e.pendingProcessing = false
+ q.mu.Unlock()
+ return e
+ }
+ q.mu.Unlock()
+ return nil
+}
+
+// empty returns true if the queue is empty, false otherwise.
+func (q *epQueue) empty() bool {
+ q.mu.Lock()
+ v := q.list.Empty()
+ q.mu.Unlock()
+ return v
+}
+
+// processor is responsible for processing packets queued to a tcp endpoint.
+type processor struct {
+ epQ epQueue
+ newEndpointWaker sleep.Waker
+ id int
+}
+
+func newProcessor(id int) *processor {
+ p := &processor{
+ id: id,
+ }
+ go p.handleSegments()
+ return p
+}
+
+func (p *processor) queueEndpoint(ep *endpoint) {
+ // Queue an endpoint for processing by the processor goroutine.
+ p.epQ.enqueue(ep)
+ p.newEndpointWaker.Assert()
+}
+
+func (p *processor) handleSegments() {
+ const newEndpointWaker = 1
+ s := sleep.Sleeper{}
+ s.AddWaker(&p.newEndpointWaker, newEndpointWaker)
+ defer s.Done()
+ for {
+ s.Fetch(true)
+ for ep := p.epQ.dequeue(); ep != nil; ep = p.epQ.dequeue() {
+ if ep.segmentQueue.empty() {
+ continue
+ }
+
+ // If socket has transitioned out of connected state
+ // then just let the worker handle the packet.
+ //
+ // NOTE: We read this outside of e.mu lock which means
+ // that by the time we get to handleSegments the
+ // endpoint may not be in ESTABLISHED. But this should
+ // be fine as all normal shutdown states are handled by
+ // handleSegments and if the endpoint moves to a
+ // CLOSED/ERROR state then handleSegments is a noop.
+ if ep.EndpointState() != StateEstablished {
+ ep.newSegmentWaker.Assert()
+ continue
+ }
+
+ if !ep.workMu.TryLock() {
+ ep.newSegmentWaker.Assert()
+ continue
+ }
+ // If the endpoint is in a connected state then we do
+ // direct delivery to ensure low latency and avoid
+ // scheduler interactions.
+ if err := ep.handleSegments(true /* fastPath */); err != nil || ep.EndpointState() == StateClose {
+ // Send any active resets if required.
+ if err != nil {
+ ep.mu.Lock()
+ ep.resetConnectionLocked(err)
+ ep.mu.Unlock()
+ }
+ ep.notifyProtocolGoroutine(notifyTickleWorker)
+ ep.workMu.Unlock()
+ continue
+ }
+
+ if !ep.segmentQueue.empty() {
+ p.epQ.enqueue(ep)
+ }
+
+ ep.workMu.Unlock()
+ }
+ }
+}
+
+// dispatcher manages a pool of TCP endpoint processors which are responsible
+// for the processing of inbound segments. This fixed pool of processor
+// goroutines does full TCP processing. The processor is selected based on the
+// hash of the endpoint id to ensure that delivery for the same endpoint happens
+// in-order.
+type dispatcher struct {
+ processors []*processor
+ seed uint32
+}
+
+func newDispatcher(nProcessors int) *dispatcher {
+ processors := []*processor{}
+ for i := 0; i < nProcessors; i++ {
+ processors = append(processors, newProcessor(i))
+ }
+ return &dispatcher{
+ processors: processors,
+ seed: generateRandUint32(),
+ }
+}
+
+func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+ ep := stackEP.(*endpoint)
+ s := newSegment(r, id, pkt)
+ if !s.parse() {
+ ep.stack.Stats().MalformedRcvdPackets.Increment()
+ ep.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
+ ep.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
+ s.decRef()
+ return
+ }
+
+ if !s.csumValid {
+ ep.stack.Stats().MalformedRcvdPackets.Increment()
+ ep.stack.Stats().TCP.ChecksumErrors.Increment()
+ ep.stats.ReceiveErrors.ChecksumErrors.Increment()
+ s.decRef()
+ return
+ }
+
+ ep.stack.Stats().TCP.ValidSegmentsReceived.Increment()
+ ep.stats.SegmentsReceived.Increment()
+ if (s.flags & header.TCPFlagRst) != 0 {
+ ep.stack.Stats().TCP.ResetsReceived.Increment()
+ }
+
+ if !ep.enqueueSegment(s) {
+ s.decRef()
+ return
+ }
+
+ // For sockets not in established state let the worker goroutine
+ // handle the packets.
+ if ep.EndpointState() != StateEstablished {
+ ep.newSegmentWaker.Assert()
+ return
+ }
+
+ d.selectProcessor(id).queueEndpoint(ep)
+}
+
+func generateRandUint32() uint32 {
+ b := make([]byte, 4)
+ if _, err := rand.Read(b); err != nil {
+ panic(err)
+ }
+ return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func (d *dispatcher) selectProcessor(id stack.TransportEndpointID) *processor {
+ payload := []byte{
+ byte(id.LocalPort),
+ byte(id.LocalPort >> 8),
+ byte(id.RemotePort),
+ byte(id.RemotePort >> 8)}
+
+ h := jenkins.Sum32(d.seed)
+ h.Write(payload)
+ h.Write([]byte(id.LocalAddress))
+ h.Write([]byte(id.RemoteAddress))
+
+ return d.processors[h.Sum32()%uint32(len(d.processors))]
+}
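The dispatcher above pins every connection to one processor goroutine by hashing its 4-tuple, so segments for a given endpoint are always handled in order by the same goroutine. A small, runnable sketch of the same idea (hypothetical names; hash/fnv stands in for the seeded jenkins hash used above):

package main

import (
	"fmt"
	"hash/fnv"
)

// pickProcessor maps a TCP 4-tuple to a fixed worker index so that all
// segments of one connection land on the same processing goroutine.
func pickProcessor(localAddr, remoteAddr string, localPort, remotePort uint16, nProcessors uint32) uint32 {
	h := fnv.New32a()
	h.Write([]byte{byte(localPort), byte(localPort >> 8), byte(remotePort), byte(remotePort >> 8)})
	h.Write([]byte(localAddr))
	h.Write([]byte(remoteAddr))
	return h.Sum32() % nProcessors
}

func main() {
	// The same connection always hashes to the same processor.
	fmt.Println(pickProcessor("10.0.0.1", "10.0.0.2", 49152, 80, 4))
	fmt.Println(pickProcessor("10.0.0.1", "10.0.0.2", 49152, 80, 4))
}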
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index dfaa4a559..4f361b226 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -391,9 +391,8 @@ func testV4Accept(t *testing.T, c *context.Context) {
// Make sure we get the same error when calling the original ep and the
// new one. This validates that v4-mapped endpoints are still able to
// query the V6Only flag, whereas pure v4 endpoints are not.
- var v tcpip.V6OnlyOption
- expected := c.EP.GetSockOpt(&v)
- if err := nep.GetSockOpt(&v); err != expected {
+ _, expected := c.EP.GetSockOptBool(tcpip.V6OnlyOption)
+ if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != expected {
t.Fatalf("GetSockOpt returned unexpected value: got %v, want %v", err, expected)
}
@@ -531,8 +530,7 @@ func TestV6AcceptOnV6(t *testing.T) {
// Make sure we can still query the v6 only status of the new endpoint,
// that is, that it is in fact a v6 socket.
- var v tcpip.V6OnlyOption
- if err := nep.GetSockOpt(&v); err != nil {
+ if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != nil {
t.Fatalf("GetSockOpt failed failed: %v", err)
}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 9d4a87e30..4797f11d1 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -19,17 +19,18 @@ import (
"fmt"
"math"
"strings"
- "sync"
"sync/atomic"
"time"
"gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/sleep"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/iptables"
+ "gvisor.dev/gvisor/pkg/tcpip/ports"
"gvisor.dev/gvisor/pkg/tcpip/seqnum"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/tmutex"
@@ -119,6 +120,7 @@ const (
notifyMTUChanged
notifyDrain
notifyReset
+ notifyResetByPeer
notifyKeepaliveChanged
notifyMSSChanged
// notifyTickleWorker is used to tickle the protocol main loop during a
@@ -126,6 +128,7 @@ const (
// ensures the loop terminates if the final state of the endpoint is
// say TIME_WAIT.
notifyTickleWorker
+ notifyError
)
// SACKInfo holds TCP SACK related information for a given endpoint.
@@ -282,6 +285,18 @@ func (*EndpointInfo) IsEndpointInfo() {}
type endpoint struct {
EndpointInfo
+ // endpointEntry is used to queue endpoints for processing by a given
+ // TCP processor goroutine.
+ //
+ // Precondition: epQueue.mu must be held to read/write this field.
+ endpointEntry `state:"nosave"`
+
+ // pendingProcessing is true if this endpoint is queued for processing
+ // to a TCP processor.
+ //
+ // Precondition: epQueue.mu must be held to read/write this field.
+ pendingProcessing bool `state:"nosave"`
+
// workMu is used to arbitrate which goroutine may perform protocol
// work. Only the main protocol goroutine is expected to call Lock() on
// it, but other goroutines (e.g., send) may call TryLock() to eagerly
@@ -323,6 +338,7 @@ type endpoint struct {
// The following fields are protected by the mutex.
mu sync.RWMutex `state:"nosave"`
+ // state must be read/set using the EndpointState()/setEndpointState() methods.
state EndpointState `state:".(EndpointState)"`
// origEndpointState is only used during a restore phase to save the
@@ -340,9 +356,11 @@ type endpoint struct {
// TCP should never broadcast but Linux nevertheless supports enabling/
// disabling SO_BROADCAST, albeit as a NOOP.
broadcast bool
+
// Values used to reserve a port or register a transport endpoint
// (which ever happens first).
boundBindToDevice tcpip.NICID
+ boundPortFlags ports.Flags
// effectiveNetProtos contains the network protocols actually in use. In
// most cases it will only contain "netProto", but in cases like IPv6
@@ -356,7 +374,7 @@ type endpoint struct {
workerRunning bool
// workerCleanup specifies if the worker goroutine must perform cleanup
- // before exitting. This can only be set to true when workerRunning is
+ // before exiting. This can only be set to true when workerRunning is
// also true, and they're both protected by the mutex.
workerCleanup bool
@@ -368,6 +386,8 @@ type endpoint struct {
// recentTS is the timestamp that should be sent in the TSEcr field of
// the timestamp for future segments sent by the endpoint. This field is
// updated if required when a new segment is received by this endpoint.
+ //
+ // recentTS must be read/written atomically.
recentTS uint32
// tsOffset is a randomized offset added to the value of the
@@ -472,6 +492,12 @@ type endpoint struct {
// without hearing a response, the connection is closed.
keepalive keepalive
+ // userTimeout, if non-zero, specifies a user-specified timeout for a
+ // connection with pending data to send. A connection that has pending
+ // unacked data will be forcibly aborted if the timeout is reached
+ // without any data being acked.
+ userTimeout time.Duration
+
// pendingAccepted is a synchronization primitive used to track number
// of connections that are queued up to be delivered to the accepted
// channel. We use this to ensure that all goroutines blocked on writing
@@ -558,6 +584,47 @@ func (e *endpoint) ResumeWork() {
e.workMu.Unlock()
}
+// setEndpointState updates the state of the endpoint to state atomically. This
+// method is unexported as the only place we should update the state is in this
+// package but we allow the state to be read freely without holding e.mu.
+//
+// Precondition: e.mu must be held to call this method.
+func (e *endpoint) setEndpointState(state EndpointState) {
+ oldstate := EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
+ switch state {
+ case StateEstablished:
+ e.stack.Stats().TCP.CurrentEstablished.Increment()
+ case StateError:
+ fallthrough
+ case StateClose:
+ if oldstate == StateCloseWait || oldstate == StateEstablished {
+ e.stack.Stats().TCP.EstablishedResets.Increment()
+ }
+ fallthrough
+ default:
+ if oldstate == StateEstablished {
+ e.stack.Stats().TCP.CurrentEstablished.Decrement()
+ }
+ }
+ atomic.StoreUint32((*uint32)(&e.state), uint32(state))
+}
+
+// EndpointState returns the current state of the endpoint.
+func (e *endpoint) EndpointState() EndpointState {
+ return EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
+}
+
+// setRecentTimestamp atomically sets the recentTS field to the
+// provided value.
+func (e *endpoint) setRecentTimestamp(recentTS uint32) {
+ atomic.StoreUint32(&e.recentTS, recentTS)
+}
+
+// recentTimestamp atomically reads and returns the value of the recentTS field.
+func (e *endpoint) recentTimestamp() uint32 {
+ return atomic.LoadUint32(&e.recentTS)
+}
+
// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
@@ -647,7 +714,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
e.mu.RLock()
defer e.mu.RUnlock()
- switch e.state {
+ switch e.EndpointState() {
case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
// Ready for nothing.
@@ -663,7 +730,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
}
}
}
- if e.state.connected() {
+ if e.EndpointState().connected() {
// Determine if the endpoint is writable if requested.
if (mask & waiter.EventOut) != 0 {
e.sndBufMu.Lock()
@@ -724,22 +791,29 @@ func (e *endpoint) Close() {
// Issue a shutdown so that the peer knows we won't send any more data
// if we're connected, or stop accepting if we're listening.
e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
+ e.closeNoShutdown()
+}
+// closeNoShutdown closes the endpoint without doing a full shutdown. This is
+// used when a connection needs to be aborted with a RST and we want to skip
+// a full 4-way TCP shutdown.
+func (e *endpoint) closeNoShutdown() {
e.mu.Lock()
// For listening sockets, we always release ports inline so that they
// are immediately available for reuse after Close() is called. If also
// registered, we unregister as well otherwise the next user would fail
// in Listen() when trying to register.
- if e.state == StateListen && e.isPortReserved {
+ if e.EndpointState() == StateListen && e.isPortReserved {
if e.isRegistered {
e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
e.isRegistered = false
}
- e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+ e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
e.isPortReserved = false
e.boundBindToDevice = 0
+ e.boundPortFlags = ports.Flags{}
}
// Mark endpoint as closed.
@@ -770,6 +844,8 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
defer close(done)
for n := range e.acceptedChan {
n.notifyProtocolGoroutine(notifyReset)
+ // Close all connections that have completed but have not
+ // yet been accepted by the application.
n.Close()
}
}()
@@ -787,11 +863,13 @@ func (e *endpoint) closePendingAcceptableConnectionsLocked() {
// after Close() is called and the worker goroutine (if any) is done with its
// work.
func (e *endpoint) cleanupLocked() {
+
// Close all endpoints that might have been accepted by TCP but not by
// the client.
if e.acceptedChan != nil {
e.closePendingAcceptableConnectionsLocked()
}
+
e.workerCleanup = false
if e.isRegistered {
@@ -800,10 +878,11 @@ func (e *endpoint) cleanupLocked() {
}
if e.isPortReserved {
- e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+ e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
e.isPortReserved = false
}
e.boundBindToDevice = 0
+ e.boundPortFlags = ports.Flags{}
e.route.Release()
e.stack.CompleteTransportEndpointCleanup(e)
@@ -874,8 +953,14 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
// reject valid data that might already be in flight as the
// acceptable window will shrink.
if rcvWnd > e.rcvBufSize {
+ availBefore := e.receiveBufferAvailableLocked()
e.rcvBufSize = rcvWnd
- e.notifyProtocolGoroutine(notifyReceiveWindowChanged)
+ availAfter := e.receiveBufferAvailableLocked()
+ mask := uint32(notifyReceiveWindowChanged)
+ if crossed, above := e.windowCrossedACKThreshold(availAfter - availBefore); crossed && above {
+ mask |= notifyNonZeroReceiveWindow
+ }
+ e.notifyProtocolGoroutine(mask)
}
// We only update prevCopied when we grow the buffer because in cases
@@ -903,7 +988,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
// reads to proceed before returning a ECONNRESET.
e.rcvListMu.Lock()
bufUsed := e.rcvBufUsed
- if s := e.state; !s.connected() && s != StateClose && bufUsed == 0 {
+ if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
e.rcvListMu.Unlock()
he := e.HardError
e.mu.RUnlock()
@@ -927,7 +1012,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
if e.rcvBufUsed == 0 {
- if e.rcvClosed || !e.state.connected() {
+ if e.rcvClosed || !e.EndpointState().connected() {
return buffer.View{}, tcpip.ErrClosedForReceive
}
return buffer.View{}, tcpip.ErrWouldBlock
@@ -944,11 +1029,12 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
}
e.rcvBufUsed -= len(v)
- // If the window was zero before this read and if the read freed up
- // enough buffer space for the scaled window to be non-zero then notify
- // the protocol goroutine to send a window update.
- if e.zeroWindow && !e.zeroReceiveWindow(e.rcv.rcvWndScale) {
- e.zeroWindow = false
+
+ // If the window was small before this read and the read freed up
+ // enough buffer space to fit either an aMSS or half a receive buffer
+ // (whichever is smaller), then notify the protocol goroutine to send
+ // a window update.
+ if crossed, above := e.windowCrossedACKThreshold(len(v)); crossed && above {
e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
}
@@ -962,8 +1048,8 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
// Caller must hold e.mu and e.sndBufMu
func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
// The endpoint cannot be written to if it's not connected.
- if !e.state.connected() {
- switch e.state {
+ if !e.EndpointState().connected() {
+ switch e.EndpointState() {
case StateError:
return 0, e.HardError
default:
@@ -1021,42 +1107,86 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
return 0, nil, perr
}
- if !opts.Atomic { // See above.
- e.mu.RLock()
- e.sndBufMu.Lock()
+ if opts.Atomic {
+ // Add data to the send queue.
+ s := newSegmentFromView(&e.route, e.ID, v)
+ e.sndBufUsed += len(v)
+ e.sndBufInQueue += seqnum.Size(len(v))
+ e.sndQueue.PushBack(s)
+ e.sndBufMu.Unlock()
+ // Release the endpoint lock to prevent deadlocks due to lock
+ // order inversion when acquiring workMu.
+ e.mu.RUnlock()
+ }
- // Because we released the lock before copying, check state again
- // to make sure the endpoint is still in a valid state for a write.
- avail, err = e.isEndpointWritableLocked()
- if err != nil {
+ if e.workMu.TryLock() {
+ // Since we released locks in between, it's possible that the
+ // endpoint transitioned to a CLOSED/ERROR state, so make
+ // sure the endpoint is still writable before trying to write.
+ if !opts.Atomic { // See above.
+ e.mu.RLock()
+ e.sndBufMu.Lock()
+
+ // Because we released the lock before copying, check state again
+ // to make sure the endpoint is still in a valid state for a write.
+ avail, err = e.isEndpointWritableLocked()
+ if err != nil {
+ e.sndBufMu.Unlock()
+ e.mu.RUnlock()
+ e.stats.WriteErrors.WriteClosed.Increment()
+ return 0, nil, err
+ }
+
+ // Discard any excess data copied in due to avail being reduced due
+ // to a simultaneous write call to the socket.
+ if avail < len(v) {
+ v = v[:avail]
+ }
+ // Add data to the send queue.
+ s := newSegmentFromView(&e.route, e.ID, v)
+ e.sndBufUsed += len(v)
+ e.sndBufInQueue += seqnum.Size(len(v))
+ e.sndQueue.PushBack(s)
e.sndBufMu.Unlock()
+ // Release the endpoint lock to prevent deadlocks due to lock
+ // order inversion when acquiring workMu.
e.mu.RUnlock()
- e.stats.WriteErrors.WriteClosed.Increment()
- return 0, nil, err
- }
- // Discard any excess data copied in due to avail being reduced due
- // to a simultaneous write call to the socket.
- if avail < len(v) {
- v = v[:avail]
}
- }
-
- // Add data to the send queue.
- s := newSegmentFromView(&e.route, e.ID, v)
- e.sndBufUsed += len(v)
- e.sndBufInQueue += seqnum.Size(len(v))
- e.sndQueue.PushBack(s)
- e.sndBufMu.Unlock()
- // Release the endpoint lock to prevent deadlocks due to lock
- // order inversion when acquiring workMu.
- e.mu.RUnlock()
-
- if e.workMu.TryLock() {
// Do the work inline.
e.handleWrite()
e.workMu.Unlock()
} else {
+ if !opts.Atomic { // See above.
+ e.mu.RLock()
+ e.sndBufMu.Lock()
+
+ // Because we released the lock before copying, check state again
+ // to make sure the endpoint is still in a valid state for a write.
+ avail, err = e.isEndpointWritableLocked()
+ if err != nil {
+ e.sndBufMu.Unlock()
+ e.mu.RUnlock()
+ e.stats.WriteErrors.WriteClosed.Increment()
+ return 0, nil, err
+ }
+
+ // Discard any excess data copied in due to avail being reduced due
+ // to a simultaneous write call to the socket.
+ if avail < len(v) {
+ v = v[:avail]
+ }
+ // Add data to the send queue.
+ s := newSegmentFromView(&e.route, e.ID, v)
+ e.sndBufUsed += len(v)
+ e.sndBufInQueue += seqnum.Size(len(v))
+ e.sndQueue.PushBack(s)
+ e.sndBufMu.Unlock()
+ // Release the endpoint lock to prevent deadlocks due to lock
+ // order inversion when acquiring workMu.
+ e.mu.RUnlock()
+
+ }
// Let the protocol goroutine do the work.
e.sndWaker.Assert()
}
@@ -1073,7 +1203,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
// The endpoint can be read if it's connected, or if it's already closed
// but has some pending unread data.
- if s := e.state; !s.connected() && s != StateClose {
+ if s := e.EndpointState(); !s.connected() && s != StateClose {
if s == StateError {
return 0, tcpip.ControlMessages{}, e.HardError
}
@@ -1085,7 +1215,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
defer e.rcvListMu.Unlock()
if e.rcvBufUsed == 0 {
- if e.rcvClosed || !e.state.connected() {
+ if e.rcvClosed || !e.EndpointState().connected() {
e.stats.ReadErrors.ReadClosed.Increment()
return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
}
@@ -1122,20 +1252,65 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
return num, tcpip.ControlMessages{}, nil
}
-// zeroReceiveWindow checks if the receive window to be announced now would be
-// zero, based on the amount of available buffer and the receive window scaling.
+// windowCrossedACKThreshold checks if the receive window to be announced now
+// would be under aMSS or under half the receive buffer, whichever is smaller.
+// This is useful as a receive-side silly window syndrome prevention mechanism.
+// If the window grows to a reasonable value, we should send an ACK to inform
+// the sender that the rx space is now large. We also want to ensure a series
+// of small read()'s won't trigger a flood of spurious tiny ACK's.
//
-// It must be called with rcvListMu held.
-func (e *endpoint) zeroReceiveWindow(scale uint8) bool {
- if e.rcvBufUsed >= e.rcvBufSize {
- return true
+// For large receive buffers, the threshold is aMSS - once the reader reads
+// more than aMSS we'll send an ACK. For tiny receive buffers, the threshold
+// is half of the receive buffer size. This is chosen arbitrarily.
+// crossed will be true if the window size crossed the ACK threshold.
+// above will be true if the new window is >= ACK threshold and false
+// otherwise.
+func (e *endpoint) windowCrossedACKThreshold(deltaBefore int) (crossed bool, above bool) {
+ newAvail := e.receiveBufferAvailableLocked()
+ oldAvail := newAvail - deltaBefore
+ if oldAvail < 0 {
+ oldAvail = 0
+ }
+
+ threshold := int(e.amss)
+ if threshold > e.rcvBufSize/2 {
+ threshold = e.rcvBufSize / 2
+ }
+
+ switch {
+ case oldAvail < threshold && newAvail >= threshold:
+ return true, true
+ case oldAvail >= threshold && newAvail < threshold:
+ return true, false
+ }
+ return false, false
+}
+
+// SetSockOptBool sets a socket option.
+func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+ switch opt {
+ case tcpip.V6OnlyOption:
+ // We only recognize this option on v6 endpoints.
+ if e.NetProto != header.IPv6ProtocolNumber {
+ return tcpip.ErrInvalidEndpointState
+ }
+
+ e.mu.Lock()
+ defer e.mu.Unlock()
+
+ // We only allow this to be set when we're in the initial state.
+ if e.EndpointState() != StateInitial {
+ return tcpip.ErrInvalidEndpointState
+ }
+
+ e.v6only = v
}
- return ((e.rcvBufSize - e.rcvBufUsed) >> scale) == 0
+ return nil
}
// SetSockOptInt sets a socket option.
-func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
switch opt {
case tcpip.ReceiveBufferSizeOption:
// Make sure the receive buffer size is within the min and max
@@ -1170,10 +1345,16 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
size = math.MaxInt32 / 2
}
+ availBefore := e.receiveBufferAvailableLocked()
e.rcvBufSize = size
+ availAfter := e.receiveBufferAvailableLocked()
+
e.rcvAutoParams.disabled = true
- if e.zeroWindow && !e.zeroReceiveWindow(scale) {
- e.zeroWindow = false
+
+ // Immediately send an ACK to uncork the sender from silly window
+ // syndrome prevention, when our available space grows above aMSS
+ // or half the receive buffer, whichever is smaller.
+ if crossed, above := e.windowCrossedACKThreshold(availAfter - availBefore); crossed && above {
mask |= notifyNonZeroReceiveWindow
}
e.rcvListMu.Unlock()
@@ -1245,19 +1426,14 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
return nil
case tcpip.BindToDeviceOption:
- e.mu.Lock()
- defer e.mu.Unlock()
- if v == "" {
- e.bindToDevice = 0
- return nil
+ id := tcpip.NICID(v)
+ if id != 0 && !e.stack.HasNIC(id) {
+ return tcpip.ErrUnknownDevice
}
- for nicID, nic := range e.stack.NICInfo() {
- if nic.Name == string(v) {
- e.bindToDevice = nicID
- return nil
- }
- }
- return tcpip.ErrUnknownDevice
+ e.mu.Lock()
+ e.bindToDevice = id
+ e.mu.Unlock()
+ return nil
case tcpip.QuickAckOption:
if v == 0 {
@@ -1278,23 +1454,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
e.notifyProtocolGoroutine(notifyMSSChanged)
return nil
- case tcpip.V6OnlyOption:
- // We only recognize this option on v6 endpoints.
- if e.NetProto != header.IPv6ProtocolNumber {
- return tcpip.ErrInvalidEndpointState
- }
-
- e.mu.Lock()
- defer e.mu.Unlock()
-
- // We only allow this to be set when we're in the initial state.
- if e.state != StateInitial {
- return tcpip.ErrInvalidEndpointState
- }
-
- e.v6only = v != 0
- return nil
-
case tcpip.TTLOption:
e.mu.Lock()
e.ttl = uint8(v)
@@ -1329,6 +1488,12 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
e.notifyProtocolGoroutine(notifyKeepaliveChanged)
return nil
+ case tcpip.TCPUserTimeoutOption:
+ e.mu.Lock()
+ e.userTimeout = time.Duration(v)
+ e.mu.Unlock()
+ return nil
+
case tcpip.BroadcastOption:
e.mu.Lock()
e.broadcast = v != 0
@@ -1349,14 +1514,14 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
// Acquire the work mutex as we may need to
// reinitialize the congestion control state.
e.mu.Lock()
- state := e.state
+ state := e.EndpointState()
e.cc = v
e.mu.Unlock()
switch state {
case StateEstablished:
e.workMu.Lock()
e.mu.Lock()
- if e.state == state {
+ if e.EndpointState() == state {
e.snd.cc = e.snd.initCongestionControl(e.cc)
}
e.mu.Unlock()
@@ -1419,7 +1584,7 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
defer e.mu.RUnlock()
// The endpoint cannot be in listen state.
- if e.state == StateListen {
+ if e.EndpointState() == StateListen {
return 0, tcpip.ErrInvalidEndpointState
}
@@ -1429,8 +1594,27 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
return e.rcvBufUsed, nil
}
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+ switch opt {
+ case tcpip.V6OnlyOption:
+ // We only recognize this option on v6 endpoints.
+ if e.NetProto != header.IPv6ProtocolNumber {
+ return false, tcpip.ErrUnknownProtocolOption
+ }
+
+ e.mu.Lock()
+ v := e.v6only
+ e.mu.Unlock()
+
+ return v, nil
+ }
+
+ return false, tcpip.ErrUnknownProtocolOption
+}
+
// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
switch opt {
case tcpip.ReceiveQueueSizeOption:
return e.readyReceiveSize()
@@ -1508,12 +1692,8 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
case *tcpip.BindToDeviceOption:
e.mu.RLock()
- defer e.mu.RUnlock()
- if nic, ok := e.stack.NICInfo()[e.bindToDevice]; ok {
- *o = tcpip.BindToDeviceOption(nic.Name)
- return nil
- }
- *o = ""
+ *o = tcpip.BindToDeviceOption(e.bindToDevice)
+ e.mu.RUnlock()
return nil
case *tcpip.QuickAckOption:
@@ -1523,22 +1703,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
}
return nil
- case *tcpip.V6OnlyOption:
- // We only recognize this option on v6 endpoints.
- if e.NetProto != header.IPv6ProtocolNumber {
- return tcpip.ErrUnknownProtocolOption
- }
-
- e.mu.Lock()
- v := e.v6only
- e.mu.Unlock()
-
- *o = 0
- if v {
- *o = 1
- }
- return nil
-
case *tcpip.TTLOption:
e.mu.Lock()
*o = tcpip.TTLOption(e.ttl)
@@ -1587,6 +1751,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
e.keepalive.Unlock()
return nil
+ case *tcpip.TCPUserTimeoutOption:
+ e.mu.Lock()
+ *o = tcpip.TCPUserTimeoutOption(e.userTimeout)
+ e.mu.Unlock()
+ return nil
+
case *tcpip.OutOfBandInlineOption:
// We don't currently support disabling this option.
*o = 1
@@ -1633,26 +1803,11 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
}
func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
- netProto := e.NetProto
- if header.IsV4MappedAddress(addr.Addr) {
- // Fail if using a v4 mapped address on a v6only endpoint.
- if e.v6only {
- return 0, tcpip.ErrNoRoute
- }
-
- netProto = header.IPv4ProtocolNumber
- addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
- if addr.Addr == header.IPv4Any {
- addr.Addr = ""
- }
- }
-
- // Fail if we're bound to an address length different from the one we're
- // checking.
- if l := len(e.ID.LocalAddress); l != 0 && len(addr.Addr) != 0 && l != len(addr.Addr) {
- return 0, tcpip.ErrInvalidEndpointState
+ unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+ if err != nil {
+ return 0, err
}
-
+ *addr = unwrapped
return netProto, nil
}
@@ -1688,7 +1843,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
return err
}
- if e.state.connected() {
+ if e.EndpointState().connected() {
// The endpoint is already connected. If caller hasn't been
// notified yet, return success.
if !e.isConnectNotified {
@@ -1700,7 +1855,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
}
nicID := addr.NIC
- switch e.state {
+ switch e.EndpointState() {
case StateBound:
// If we're already bound to a NIC but the caller is requesting
// that we use a different one now, we cannot proceed.
@@ -1775,7 +1930,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
}
// reusePort is false below because connect cannot reuse a port even if
// reusePort was set.
- if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, false /* reusePort */, e.bindToDevice) {
+ if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, ports.Flags{LoadBalanced: false}, e.bindToDevice) {
return false, nil
}
@@ -1802,12 +1957,12 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
// before Connect: in such a case we don't want to hold on to
// reservations anymore.
if e.isPortReserved {
- e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.boundBindToDevice)
+ e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
e.isPortReserved = false
}
e.isRegistered = true
- e.state = StateConnecting
+ e.setEndpointState(StateConnecting)
e.route = r.Clone()
e.boundNICID = nicID
e.effectiveNetProtos = netProtos
@@ -1828,14 +1983,13 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
}
e.segmentQueue.mu.Unlock()
e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
- e.state = StateEstablished
- e.stack.Stats().TCP.CurrentEstablished.Increment()
+ e.setEndpointState(StateEstablished)
}
if run {
e.workerRunning = true
e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
- go e.protocolMainLoop(handshake) // S/R-SAFE: will be drained before save.
+ go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save.
}
return tcpip.ErrConnectStarted
@@ -1853,7 +2007,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
e.shutdownFlags |= flags
finQueued := false
switch {
- case e.state.connected():
+ case e.EndpointState().connected():
// Close for read.
if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
// Mark read side as closed.
@@ -1865,8 +2019,18 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
// If we're fully closed and we have unread data we need to abort
// the connection with a RST.
if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
- e.notifyProtocolGoroutine(notifyReset)
e.mu.Unlock()
+ // Try to send an active reset immediately if the
+ // work mutex is available.
+ if e.workMu.TryLock() {
+ e.mu.Lock()
+ e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+ e.notifyProtocolGoroutine(notifyTickleWorker)
+ e.mu.Unlock()
+ e.workMu.Unlock()
+ } else {
+ e.notifyProtocolGoroutine(notifyReset)
+ }
return nil
}
}
@@ -1888,11 +2052,10 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
finQueued = true
// Mark endpoint as closed.
e.sndClosed = true
-
e.sndBufMu.Unlock()
}
- case e.state == StateListen:
+ case e.EndpointState() == StateListen:
// Tell protocolListenLoop to stop.
if flags&tcpip.ShutdownRead != 0 {
e.notifyProtocolGoroutine(notifyClose)
@@ -1933,7 +2096,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
// When the endpoint shuts down, it sets workerCleanup to true, and from
// that point onward, acceptedChan is the responsibility of the cleanup()
// method (and should not be touched anywhere else, including here).
- if e.state == StateListen && !e.workerCleanup {
+ if e.EndpointState() == StateListen && !e.workerCleanup {
// Adjust the size of the channel iff we can fix existing
// pending connections into the new one.
if len(e.acceptedChan) > backlog {
@@ -1951,8 +2114,17 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
return nil
}
+ if e.EndpointState() == StateInitial {
+ // If listen is called on an unbound socket, the socket is
+ // automatically bound to a random free port with the local
+ // address set to INADDR_ANY.
+ if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
+ return err
+ }
+ }
+
// Endpoint must be bound before it can transition to listen mode.
- if e.state != StateBound {
+ if e.EndpointState() != StateBound {
e.stats.ReadErrors.InvalidEndpointState.Increment()
return tcpip.ErrInvalidEndpointState
}
@@ -1963,24 +2135,27 @@ func (e *endpoint) listen(backlog int) *tcpip.Error {
}
e.isRegistered = true
- e.state = StateListen
+ e.setEndpointState(StateListen)
+
if e.acceptedChan == nil {
e.acceptedChan = make(chan *endpoint, backlog)
}
e.workerRunning = true
-
go e.protocolListenLoop( // S/R-SAFE: drained on save.
seqnum.Size(e.receiveBufferAvailable()))
-
return nil
}
// startAcceptedLoop sets up required state and starts a goroutine with the
// main loop for accepted connections.
func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
+ e.mu.Lock()
e.waiterQueue = waiterQueue
e.workerRunning = true
- go e.protocolMainLoop(false) // S/R-SAFE: drained on save.
+ e.mu.Unlock()
+ wakerInitDone := make(chan struct{})
+ go e.protocolMainLoop(false, wakerInitDone) // S/R-SAFE: drained on save.
+ <-wakerInitDone
}
// Accept returns a new endpoint if a peer has established a connection
@@ -1990,7 +2165,7 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
defer e.mu.RUnlock()
// Endpoint must be in listen state before it can accept connections.
- if e.state != StateListen {
+ if e.EndpointState() != StateListen {
return nil, nil, tcpip.ErrInvalidEndpointState
}
@@ -2010,10 +2185,14 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
e.mu.Lock()
defer e.mu.Unlock()
+ return e.bindLocked(addr)
+}
+
+func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
// Don't allow binding once endpoint is not in the initial state
// anymore. This is because once the endpoint goes into a connected or
// listen state, it is already bound.
- if e.state != StateInitial {
+ if e.EndpointState() != StateInitial {
return tcpip.ErrAlreadyBound
}
@@ -2034,28 +2213,33 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
}
}
- port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort, e.bindToDevice)
+ flags := ports.Flags{
+ LoadBalanced: e.reusePort,
+ }
+ port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, flags, e.bindToDevice)
if err != nil {
return err
}
e.boundBindToDevice = e.bindToDevice
+ e.boundPortFlags = flags
e.isPortReserved = true
e.effectiveNetProtos = netProtos
e.ID.LocalPort = port
// Any failures beyond this point must remove the port registration.
- defer func(bindToDevice tcpip.NICID) {
+ defer func(portFlags ports.Flags, bindToDevice tcpip.NICID) {
if err != nil {
- e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, bindToDevice)
+ e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, portFlags, bindToDevice)
e.isPortReserved = false
e.effectiveNetProtos = nil
e.ID.LocalPort = 0
e.ID.LocalAddress = ""
e.boundNICID = 0
e.boundBindToDevice = 0
+ e.boundPortFlags = ports.Flags{}
}
- }(e.boundBindToDevice)
+ }(e.boundPortFlags, e.boundBindToDevice)
// If an address is specified, we must ensure that it's one of our
// local addresses.
@@ -2070,7 +2254,7 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
}
// Mark endpoint as bound.
- e.state = StateBound
+ e.setEndpointState(StateBound)
return nil
}
@@ -2092,7 +2276,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
e.mu.RLock()
defer e.mu.RUnlock()
- if !e.state.connected() {
+ if !e.EndpointState().connected() {
return tcpip.FullAddress{}, tcpip.ErrNotConnected
}
@@ -2103,45 +2287,22 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
}, nil
}
-// HandlePacket is called by the stack when new packets arrive to this transport
-// endpoint.
func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
- s := newSegment(r, id, pkt)
- if !s.parse() {
- e.stack.Stats().MalformedRcvdPackets.Increment()
- e.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
- e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
- s.decRef()
- return
- }
-
- if !s.csumValid {
- e.stack.Stats().MalformedRcvdPackets.Increment()
- e.stack.Stats().TCP.ChecksumErrors.Increment()
- e.stats.ReceiveErrors.ChecksumErrors.Increment()
- s.decRef()
- return
- }
-
- e.stack.Stats().TCP.ValidSegmentsReceived.Increment()
- e.stats.SegmentsReceived.Increment()
- if (s.flags & header.TCPFlagRst) != 0 {
- e.stack.Stats().TCP.ResetsReceived.Increment()
- }
-
- e.enqueueSegment(s)
+ // TCP HandlePacket is not required anymore as inbound packets first
+ // land at the Dispatcher, which then either delivers them via the
+ // worker goroutine or invokes the TCP processing inline, based on
+ // the state of the endpoint.
}
-func (e *endpoint) enqueueSegment(s *segment) {
+func (e *endpoint) enqueueSegment(s *segment) bool {
// Send packet to worker goroutine.
- if e.segmentQueue.enqueue(s) {
- e.newSegmentWaker.Assert()
- } else {
+ if !e.segmentQueue.enqueue(s) {
// The queue is full, so we drop the segment.
e.stack.Stats().DroppedPackets.Increment()
e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
- s.decRef()
+ return false
}
+ return true
}
// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
@@ -2184,13 +2345,10 @@ func (e *endpoint) readyToRead(s *segment) {
if s != nil {
s.incRef()
e.rcvBufUsed += s.data.Size()
- // Check if the receive window is now closed. If so make sure
- // we set the zero window before we deliver the segment to ensure
- // that a subsequent read of the segment will correctly trigger
- // a non-zero notification.
- if avail := e.receiveBufferAvailableLocked(); avail>>e.rcv.rcvWndScale == 0 {
+ // Increase the counter if the receive window falls below MSS
+ // or half the receive buffer size, whichever is smaller.
+ if crossed, above := e.windowCrossedACKThreshold(-s.data.Size()); crossed && !above {
e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
- e.zeroWindow = true
}
e.rcvList.PushBack(s)
} else {
@@ -2261,8 +2419,8 @@ func (e *endpoint) rcvWndScaleForHandshake() int {
// updateRecentTimestamp updates the recent timestamp using the algorithm
// described in https://tools.ietf.org/html/rfc7323#section-4.3
func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
- if e.sendTSOk && seqnum.Value(e.recentTS).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
- e.recentTS = tsVal
+ if e.sendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
+ e.setRecentTimestamp(tsVal)
}
}
@@ -2272,7 +2430,7 @@ func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value,
func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
if synOpts.TS {
e.sendTSOk = true
- e.recentTS = synOpts.TSVal
+ e.setRecentTimestamp(synOpts.TSVal)
}
}
@@ -2361,7 +2519,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
// Endpoint TCP Option state.
s.SendTSOk = e.sendTSOk
- s.RecentTS = e.recentTS
+ s.RecentTS = e.recentTimestamp()
s.TSOffset = e.tsOffset
s.SACKPermitted = e.sackPermitted
s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
@@ -2468,9 +2626,7 @@ func (e *endpoint) initGSO() {
// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *endpoint) State() uint32 {
- e.mu.Lock()
- defer e.mu.Unlock()
- return uint32(e.state)
+ return uint32(e.EndpointState())
}
// Info returns a copy of the endpoint info.
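The hunks above replace direct reads and writes of e.state with the EndpointState and setEndpointState accessors, whose definitions are not part of this excerpt. A minimal sketch of such accessors, assuming e.state is an EndpointState backed by a uint32 (as the atomic.StoreUint32 call in loadState below suggests), could look like this:

    // Sketch only, not part of this patch: atomic accessors for the endpoint
    // state. Per the loadState comment below, the real setEndpointState also
    // updates stack metrics on certain transitions; that is omitted here.
    func (e *endpoint) EndpointState() EndpointState {
            return EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
    }

    func (e *endpoint) setEndpointState(state EndpointState) {
            atomic.StoreUint32((*uint32)(&e.state), uint32(state))
    }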
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 7aa4c3f0e..4a46f0ec5 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -16,9 +16,10 @@ package tcp
import (
"fmt"
- "sync"
+ "sync/atomic"
"time"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -48,7 +49,7 @@ func (e *endpoint) beforeSave() {
e.mu.Lock()
defer e.mu.Unlock()
- switch e.state {
+ switch e.EndpointState() {
case StateInitial, StateBound:
// TODO(b/138137272): this enumeration duplicates
// EndpointState.connected. remove it.
@@ -70,31 +71,30 @@ func (e *endpoint) beforeSave() {
fallthrough
case StateListen, StateConnecting:
e.drainSegmentLocked()
- if e.state != StateClose && e.state != StateError {
+ if e.EndpointState() != StateClose && e.EndpointState() != StateError {
if !e.workerRunning {
panic("endpoint has no worker running in listen, connecting, or connected state")
}
break
}
- fallthrough
case StateError, StateClose:
- for (e.state == StateError || e.state == StateClose) && e.workerRunning {
+ for e.workerRunning {
e.mu.Unlock()
time.Sleep(100 * time.Millisecond)
e.mu.Lock()
}
if e.workerRunning {
- panic("endpoint still has worker running in closed or error state")
+ panic(fmt.Sprintf("endpoint: %+v still has worker running in closed or error state", e.ID))
}
default:
- panic(fmt.Sprintf("endpoint in unknown state %v", e.state))
+ panic(fmt.Sprintf("endpoint in unknown state %v", e.EndpointState()))
}
if e.waiterQueue != nil && !e.waiterQueue.IsEmpty() {
panic("endpoint still has waiters upon save")
}
- if e.state != StateClose && !((e.state == StateBound || e.state == StateListen) == e.isPortReserved) {
+ if e.EndpointState() != StateClose && !((e.EndpointState() == StateBound || e.EndpointState() == StateListen) == e.isPortReserved) {
panic("endpoints which are not in the closed state must have a reserved port IFF they are in bound or listen state")
}
}
@@ -135,7 +135,7 @@ func (e *endpoint) loadAcceptedChan(acceptedEndpoints []*endpoint) {
// saveState is invoked by stateify.
func (e *endpoint) saveState() EndpointState {
- return e.state
+ return e.EndpointState()
}
// Endpoint loading must be done in the following ordering by their state, to
@@ -151,7 +151,8 @@ var connectingLoading sync.WaitGroup
func (e *endpoint) loadState(state EndpointState) {
// This is to ensure that the loading wait groups include all applicable
// endpoints before any asynchronous calls to the Wait() methods.
- if state.connected() {
+ // For restore purposes we treat TimeWait like a connected endpoint.
+ if state.connected() || state == StateTimeWait {
connectedLoading.Add(1)
}
switch state {
@@ -160,13 +161,14 @@ func (e *endpoint) loadState(state EndpointState) {
case StateConnecting, StateSynSent, StateSynRecv:
connectingLoading.Add(1)
}
- e.state = state
+ // Directly update the state here rather than using e.setEndpointState
+ // as the endpoint is still being loaded and the stack reference used to
+ // increment metrics is not yet initialized.
+ atomic.StoreUint32((*uint32)(&e.state), uint32(state))
}
// afterLoad is invoked by stateify.
func (e *endpoint) afterLoad() {
- // Freeze segment queue before registering to prevent any segments
- // from being delivered while it is being restored.
e.origEndpointState = e.state
// Restore the endpoint to InitialState as it will be moved to
// its origEndpointState during Resume.
@@ -180,7 +182,6 @@ func (e *endpoint) Resume(s *stack.Stack) {
e.segmentQueue.setLimit(MaxUnprocessedSegments)
e.workMu.Init()
state := e.origEndpointState
-
switch state {
case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
var ss SendBufferSizeOption
@@ -276,7 +277,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
listenLoading.Wait()
connectingLoading.Wait()
bind()
- e.state = StateClose
+ e.setEndpointState(StateClose)
tcpip.AsyncLoading.Done()
}()
}
@@ -288,6 +289,7 @@ func (e *endpoint) Resume(s *stack.Stack) {
e.stack.CompleteTransportEndpointCleanup(e)
tcpip.DeleteDanglingEndpoint(e)
}
+
}
// saveLastError is invoked by stateify.
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
index 4983bca81..7eb613be5 100644
--- a/pkg/tcpip/transport/tcp/forwarder.go
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -15,8 +15,7 @@
package tcp
import (
- "sync"
-
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/seqnum"
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 89b965c23..958c06fa7 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -21,10 +21,11 @@
package tcp
import (
+ "runtime"
"strings"
- "sync"
"time"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -104,6 +105,7 @@ type protocol struct {
moderateReceiveBuffer bool
tcpLingerTimeout time.Duration
tcpTimeWaitTimeout time.Duration
+ dispatcher *dispatcher
}
// Number returns the tcp protocol number.
@@ -134,6 +136,14 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
return h.SourcePort(), h.DestinationPort(), nil
}
+// QueuePacket queues packets targeted at an endpoint after hashing the packet
+// to a specific processing queue. Each queue is serviced by its own processor
+// goroutine which is responsible for dequeuing and doing full TCP dispatch of
+// the packet.
+func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt tcpip.PacketBuffer) {
+ p.dispatcher.queuePacket(r, ep, id, pkt)
+}
+
// HandleUnknownDestinationPacket handles packets targeted at this protocol but
// that don't match any existing endpoint.
//
@@ -162,13 +172,26 @@ func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Transpo
func replyWithReset(s *segment) {
// Get the seqnum from the packet if the ack flag is set.
seq := seqnum.Value(0)
+ ack := seqnum.Value(0)
+ flags := byte(header.TCPFlagRst)
+ // As per RFC 793 page 35 (Reset Generation)
+ // 1. If the connection does not exist (CLOSED) then a reset is sent
+ // in response to any incoming segment except another reset. In
+ // particular, SYNs addressed to a non-existent connection are rejected
+ // by this means.
+
+ // If the incoming segment has an ACK field, the reset takes its
+ // sequence number from the ACK field of the segment, otherwise the
+ // reset has sequence number zero and the ACK field is set to the sum
+ // of the sequence number and segment length of the incoming segment.
+ // The connection remains in the CLOSED state.
if s.flagIsSet(header.TCPFlagAck) {
seq = s.ackNumber
+ } else {
+ flags |= header.TCPFlagAck
+ ack = s.sequenceNumber.Add(s.logicalLen())
}
-
- ack := s.sequenceNumber.Add(s.logicalLen())
-
- sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
+ sendTCP(&s.route, s.id, buffer.VectorisedView{}, s.route.DefaultTTL(), stack.DefaultTOS, flags, seq, ack, 0 /* rcvWnd */, nil /* options */, nil /* gso */)
}
// SetOption implements TransportProtocol.SetOption.
@@ -317,5 +340,6 @@ func NewProtocol() stack.TransportProtocol {
availableCongestionControl: []string{ccReno, ccCubic},
tcpLingerTimeout: DefaultTCPLingerTimeout,
tcpTimeWaitTimeout: DefaultTCPTimeWaitTimeout,
+ dispatcher: newDispatcher(runtime.GOMAXPROCS(0)),
}
}
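QueuePacket above hands every inbound segment to a dispatcher that hashes it onto one of runtime.GOMAXPROCS(0) processing queues, each drained by its own goroutine. The dispatcher implementation lives in a file not included in this diff; the sketch below only illustrates the hashing idea described in the comment, with a hypothetical helper and hash function:

    // Sketch only, not from this patch: pick a processor from the connection
    // 4-tuple so that all segments of one flow land on the same goroutine,
    // preserving per-connection ordering while spreading flows across CPUs.
    func pickProcessor(id stack.TransportEndpointID, nProcessors int) int {
            h := uint32(id.LocalPort)<<16 | uint32(id.RemotePort)
            for _, b := range []byte(id.LocalAddress + id.RemoteAddress) {
                    h = h*31 + uint32(b)
            }
            return int(h % uint32(nProcessors))
    }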
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 857dc445f..958f03ac1 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -50,16 +50,20 @@ type receiver struct {
pendingRcvdSegments segmentHeap
pendingBufUsed seqnum.Size
pendingBufSize seqnum.Size
+
+ // Time when the last ack was received.
+ lastRcvdAckTime time.Time `state:".(unixTime)"`
}
func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8, pendingBufSize seqnum.Size) *receiver {
return &receiver{
- ep: ep,
- rcvNxt: irs + 1,
- rcvAcc: irs.Add(rcvWnd + 1),
- rcvWnd: rcvWnd,
- rcvWndScale: rcvWndScale,
- pendingBufSize: pendingBufSize,
+ ep: ep,
+ rcvNxt: irs + 1,
+ rcvAcc: irs.Add(rcvWnd + 1),
+ rcvWnd: rcvWnd,
+ rcvWndScale: rcvWndScale,
+ pendingBufSize: pendingBufSize,
+ lastRcvdAckTime: time.Now(),
}
}
@@ -94,12 +98,6 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
// in such cases we may need to send an ack to indicate to our peer that it can
// resume sending data.
func (r *receiver) nonZeroWindow() {
- if (r.rcvAcc-r.rcvNxt)>>r.rcvWndScale != 0 {
- // We never got around to announcing a zero window size, so we
- // don't need to immediately announce a nonzero one.
- return
- }
-
// Immediately send an ack.
r.ep.snd.sendAck()
}
@@ -171,19 +169,19 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
// We just received a FIN, our next state depends on whether we sent a
// FIN already or not.
r.ep.mu.Lock()
- switch r.ep.state {
+ switch r.ep.EndpointState() {
case StateEstablished:
- r.ep.state = StateCloseWait
+ r.ep.setEndpointState(StateCloseWait)
case StateFinWait1:
if s.flagIsSet(header.TCPFlagAck) {
// FIN-ACK, transition to TIME-WAIT.
- r.ep.state = StateTimeWait
+ r.ep.setEndpointState(StateTimeWait)
} else {
// Simultaneous close, expecting a final ACK.
- r.ep.state = StateClosing
+ r.ep.setEndpointState(StateClosing)
}
case StateFinWait2:
- r.ep.state = StateTimeWait
+ r.ep.setEndpointState(StateTimeWait)
}
r.ep.mu.Unlock()
@@ -205,18 +203,18 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
// Handle ACK (not FIN-ACK, which we handled above) during one of the
// shutdown states.
- if s.flagIsSet(header.TCPFlagAck) {
+ if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == r.ep.snd.sndNxt {
r.ep.mu.Lock()
- switch r.ep.state {
+ switch r.ep.EndpointState() {
case StateFinWait1:
- r.ep.state = StateFinWait2
+ r.ep.setEndpointState(StateFinWait2)
// Notify protocol goroutine that we have received an
// ACK to our FIN so that it can start the FIN_WAIT2
// timer to abort connection if the other side does
// not close within 2MSL.
r.ep.notifyProtocolGoroutine(notifyClose)
case StateClosing:
- r.ep.state = StateTimeWait
+ r.ep.setEndpointState(StateTimeWait)
case StateLastAck:
r.ep.transitionToStateCloseLocked()
}
@@ -269,7 +267,6 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
switch state {
case StateCloseWait, StateClosing, StateLastAck:
if !s.sequenceNumber.LessThanEq(r.rcvNxt) {
- s.decRef()
// Just drop the segment as we have
// already received a FIN and this
// segment is after the sequence number
@@ -286,7 +283,6 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
// trigger a RST.
endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
if rcvClosed && r.rcvNxt.LessThan(endDataSeq) {
- s.decRef()
return true, tcpip.ErrConnectionAborted
}
if state == StateFinWait1 {
@@ -316,7 +312,6 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
// the last actual data octet in a segment in
// which it occurs.
if closed && (!s.flagIsSet(header.TCPFlagFin) || s.sequenceNumber.Add(s.logicalLen()) != r.rcvNxt+1) {
- s.decRef()
return true, tcpip.ErrConnectionAborted
}
}
@@ -338,7 +333,7 @@ func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, clo
// r as they arrive. It is called by the protocol main loop.
func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
r.ep.mu.RLock()
- state := r.ep.state
+ state := r.ep.EndpointState()
closed := r.ep.closed
r.ep.mu.RUnlock()
@@ -360,6 +355,9 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
return true, nil
}
+ // Store the time of the last ack.
+ r.lastRcvdAckTime = time.Now()
+
// Defer segment processing if it can't be consumed now.
if !r.consumeSegment(s, segSeq, segLen) {
if segLen > 0 || s.flagIsSet(header.TCPFlagFin) {
diff --git a/pkg/tcpip/transport/tcp/rcv_state.go b/pkg/tcpip/transport/tcp/rcv_state.go
new file mode 100644
index 000000000..2bf21a2e7
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/rcv_state.go
@@ -0,0 +1,29 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+ "time"
+)
+
+// saveLastRcvdAckTime is invoked by stateify.
+func (r *receiver) saveLastRcvdAckTime() unixTime {
+ return unixTime{r.lastRcvdAckTime.Unix(), r.lastRcvdAckTime.UnixNano()}
+}
+
+// loadLastRcvdAckTime is invoked by stateify.
+func (r *receiver) loadLastRcvdAckTime(unix unixTime) {
+ r.lastRcvdAckTime = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go
index e0759225e..bd20a7ee9 100644
--- a/pkg/tcpip/transport/tcp/segment_queue.go
+++ b/pkg/tcpip/transport/tcp/segment_queue.go
@@ -15,7 +15,7 @@
package tcp
import (
- "sync"
+ "gvisor.dev/gvisor/pkg/sync"
)
// segmentQueue is a bounded, thread-safe queue of TCP segments.
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index d3f7c9125..b74b61e7d 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -16,11 +16,11 @@ package tcp
import (
"math"
- "sync"
"sync/atomic"
"time"
"gvisor.dev/gvisor/pkg/sleep"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
@@ -28,8 +28,11 @@ import (
)
const (
- // minRTO is the minimum allowed value for the retransmit timeout.
- minRTO = 200 * time.Millisecond
+ // MinRTO is the minimum allowed value for the retransmit timeout.
+ MinRTO = 200 * time.Millisecond
+
+ // MaxRTO is the maximum allowed value for the retransmit timeout.
+ MaxRTO = 120 * time.Second
// InitialCwnd is the initial congestion window.
InitialCwnd = 10
@@ -134,6 +137,10 @@ type sender struct {
// rttMeasureTime is the time when the rttMeasureSeqNum was sent.
rttMeasureTime time.Time `state:".(unixTime)"`
+ // firstRetransmittedSegXmitTime is the original transmit time of
+ // the first segment that was retransmitted due to RTO expiration.
+ firstRetransmittedSegXmitTime time.Time `state:".(unixTime)"`
+
closed bool
writeNext *segment
writeList segmentList
@@ -392,8 +399,8 @@ func (s *sender) updateRTO(rtt time.Duration) {
s.rto = s.rtt.srtt + 4*s.rtt.rttvar
s.rtt.Unlock()
- if s.rto < minRTO {
- s.rto = minRTO
+ if s.rto < MinRTO {
+ s.rto = MinRTO
}
}
@@ -435,11 +442,40 @@ func (s *sender) retransmitTimerExpired() bool {
return true
}
+ // TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
+ // when writeList is empty. Remove this once we have a proper fix for this
+ // issue.
+ if s.writeList.Front() == nil {
+ return true
+ }
+
s.ep.stack.Stats().TCP.Timeouts.Increment()
s.ep.stats.SendErrors.Timeouts.Increment()
- // Give up if we've waited more than a minute since the last resend.
- if s.rto >= 60*time.Second {
+ // Give up if we've waited more than a minute since the last resend or
+ // if a user time out is set and we have exceeded the user specified
+ // timeout since the first retransmission.
+ s.ep.mu.RLock()
+ uto := s.ep.userTimeout
+ s.ep.mu.RUnlock()
+
+ if s.firstRetransmittedSegXmitTime.IsZero() {
+ // We store the original xmitTime of the segment that we are
+ // about to retransmit as the retransmission time. This is
+ // required because, by the time the retransmitTimer expires,
+ // the segment has already been sent and has remained unacked
+ // for the RTO that was in effect when it was sent.
+ s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
+ }
+
+ elapsed := time.Since(s.firstRetransmittedSegXmitTime)
+ remaining := MaxRTO
+ if uto != 0 {
+ // Cap to the user specified timeout if one is specified.
+ remaining = uto - elapsed
+ }
+
+ if remaining <= 0 || s.rto >= MaxRTO {
return false
}
@@ -447,6 +483,11 @@ func (s *sender) retransmitTimerExpired() bool {
// below.
s.rto *= 2
+ // Cap RTO to remaining time.
+ if s.rto > remaining {
+ s.rto = remaining
+ }
+
// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
//
// Retransmit timeouts:
@@ -664,18 +705,15 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
}
seg.flags = header.TCPFlagAck | header.TCPFlagFin
segEnd = seg.sequenceNumber.Add(1)
- // Transition to FIN-WAIT1 state since we're initiating an active close.
- s.ep.mu.Lock()
- switch s.ep.state {
+ // Update the state to reflect that we have now
+ // queued a FIN.
+ switch s.ep.EndpointState() {
case StateCloseWait:
- // We've already received a FIN and are now sending our own. The
- // sender is now awaiting a final ACK for this FIN.
- s.ep.state = StateLastAck
+ s.ep.setEndpointState(StateLastAck)
default:
- s.ep.state = StateFinWait1
+ s.ep.setEndpointState(StateFinWait1)
}
- s.ep.stack.Stats().TCP.CurrentEstablished.Decrement()
- s.ep.mu.Unlock()
+
} else {
// We're sending a non-FIN segment.
if seg.flags&header.TCPFlagFin != 0 {
@@ -1169,6 +1207,8 @@ func (s *sender) handleRcvdSegment(seg *segment) {
// RFC 6298 Rule 5.3
if s.sndUna == s.sndNxt {
s.outstanding = 0
+ // Reset firstRetransmittedSegXmitTime to the zero value.
+ s.firstRetransmittedSegXmitTime = time.Time{}
s.resendTimer.disable()
}
}
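The retransmitTimerExpired changes above give up once the RTO has backed off to MaxRTO or, when a TCP user timeout is set, once the time since the original transmission of the first retransmitted segment exceeds that timeout; the doubled RTO is also capped to the remaining time. A rough sketch of that decision, illustrative only and not the patch's code:

    // nextRTO returns the new RTO and whether the connection should stay open.
    // Example: with userTimeout = 1s and 800ms already elapsed, remaining is
    // 200ms, so a doubled RTO of 400ms is capped to 200ms before aborting.
    func nextRTO(rto, elapsed, userTimeout time.Duration) (time.Duration, bool) {
            remaining := MaxRTO
            if userTimeout != 0 {
                    remaining = userTimeout - elapsed
            }
            if remaining <= 0 || rto >= MaxRTO {
                    return 0, false // give up; the connection will be aborted
            }
            rto *= 2
            if rto > remaining {
                    rto = remaining
            }
            return rto, true
    }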
diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go
index 12eff8afc..8b20c3455 100644
--- a/pkg/tcpip/transport/tcp/snd_state.go
+++ b/pkg/tcpip/transport/tcp/snd_state.go
@@ -48,3 +48,13 @@ func (s *sender) loadRttMeasureTime(unix unixTime) {
func (s *sender) afterLoad() {
s.resendTimer.init(&s.resendWaker)
}
+
+// saveFirstRetransmittedSegXmitTime is invoked by stateify.
+func (s *sender) saveFirstRetransmittedSegXmitTime() unixTime {
+ return unixTime{s.firstRetransmittedSegXmitTime.Unix(), s.firstRetransmittedSegXmitTime.UnixNano()}
+}
+
+// loadFirstRetransmittedSegXmitTime is invoked by stateify.
+func (s *sender) loadFirstRetransmittedSegXmitTime(unix unixTime) {
+ s.firstRetransmittedSegXmitTime = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index c4b45aa6f..a9dfbe857 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -75,6 +75,20 @@ func TestGiveUpConnect(t *testing.T) {
if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != tcpip.ErrAborted {
t.Fatalf("got ep.GetSockOpt(tcpip.ErrorOption{}) = %v, want = %v", err, tcpip.ErrAborted)
}
+
+ // Call Connect again to retrieve the handshake failure status
+ // and stats updates.
+ if err := ep.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrAborted {
+ t.Fatalf("got ep.Connect(...) = %v, want = %v", err, tcpip.ErrAborted)
+ }
+
+ if got := c.Stack().Stats().TCP.FailedConnectionAttempts.Value(); got != 1 {
+ t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %v, want = 1", got)
+ }
+
+ if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+ t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+ }
}
func TestConnectIncrementActiveConnection(t *testing.T) {
@@ -279,7 +293,6 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
checker.SeqNum(uint32(c.IRS+1)),
checker.AckNum(uint32(iss)+1),
checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
-
finHeaders := &context.Headers{
SrcPort: context.TestPort,
DstPort: context.StackPort,
@@ -309,8 +322,8 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.SeqNum(uint32(c.IRS+1)),
- checker.AckNum(uint32(iss)+1),
- checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+ checker.AckNum(0),
+ checker.TCPFlags(header.TCPFlagRst)))
}
func TestTCPResetsReceivedIncrement(t *testing.T) {
@@ -445,19 +458,21 @@ func TestConnectResetAfterClose(t *testing.T) {
checker.IPv4(t, b,
checker.TCP(
checker.DstPort(context.TestPort),
+ // RST is always generated with sndNxt, which, if the FIN
+ // has been sent, will be 1 higher than the sequence number
+ // of the FIN itself.
checker.SeqNum(uint32(c.IRS)+2),
- checker.AckNum(790),
- checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+ checker.AckNum(0),
+ checker.TCPFlags(header.TCPFlagRst),
),
)
break
}
}
-// TestClosingWithEnqueuedSegments tests handling of
-// still enqueued segments when the endpoint transitions
-// to StateClose. The in-flight segments would be re-enqueued
-// to a any listening endpoint.
+// TestClosingWithEnqueuedSegments tests handling of still enqueued segments
+// when the endpoint transitions to StateClose. The in-flight segments would be
+// re-enqueued to any listening endpoint.
func TestClosingWithEnqueuedSegments(t *testing.T) {
c := context.New(t, defaultMTU)
defer c.Cleanup()
@@ -541,21 +556,29 @@ func TestClosingWithEnqueuedSegments(t *testing.T) {
ep.(interface{ ResumeWork() }).ResumeWork()
// Wait for the protocolMainLoop to resume and update state.
- time.Sleep(1 * time.Millisecond)
+ time.Sleep(10 * time.Millisecond)
// Expect the endpoint to be closed.
if got, want := tcp.EndpointState(ep.State()), tcp.StateClose; got != want {
t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
}
+ if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != 1 {
+ t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %v, want = 1", got)
+ }
+
+ if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+ t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+ }
+
// Check if the endpoint was moved to CLOSED and netstack sends a reset in
// response to the ACK packet that we sent after last-ACK.
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
checker.SeqNum(uint32(c.IRS)+2),
- checker.AckNum(793),
- checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+ checker.AckNum(0),
+ checker.TCPFlags(header.TCPFlagRst),
),
)
}
@@ -792,6 +815,82 @@ func TestSendRstOnListenerRxSynAckV6(t *testing.T) {
checker.SeqNum(200)))
}
+// TestTCPAckBeforeAcceptV4 tests that once the 3-way handshake is complete,
+// peers can send data and expect a response within a reasonable amount of time
+// without calling Accept on the listening endpoint first.
+//
+// This test uses IPv4.
+func TestTCPAckBeforeAcceptV4(t *testing.T) {
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+
+ c.Create(-1)
+
+ if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+ t.Fatal("Bind failed:", err)
+ }
+
+ if err := c.EP.Listen(10); err != nil {
+ t.Fatal("Listen failed:", err)
+ }
+
+ irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+ // Send data before accepting the connection.
+ c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagAck,
+ SeqNum: irs + 1,
+ AckNum: iss + 1,
+ })
+
+ // Receive ACK for the data we sent.
+ checker.IPv4(t, c.GetPacket(), checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.TCPFlags(header.TCPFlagAck),
+ checker.SeqNum(uint32(iss+1)),
+ checker.AckNum(uint32(irs+5))))
+}
+
+// TestTCPAckBeforeAcceptV6 tests that once the 3-way handshake is complete,
+// peers can send data and expect a response within a reasonable amount of time
+// without calling Accept on the listening endpoint first.
+//
+// This test uses IPv6.
+func TestTCPAckBeforeAcceptV6(t *testing.T) {
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+
+ c.CreateV6Endpoint(true)
+
+ if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+ t.Fatal("Bind failed:", err)
+ }
+
+ if err := c.EP.Listen(10); err != nil {
+ t.Fatal("Listen failed:", err)
+ }
+
+ irs, iss := executeV6Handshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+ // Send data before accepting the connection.
+ c.SendV6Packet([]byte{1, 2, 3, 4}, &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagAck,
+ SeqNum: irs + 1,
+ AckNum: iss + 1,
+ })
+
+ // Receive ACK for the data we sent.
+ checker.IPv6(t, c.GetV6Packet(), checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.TCPFlags(header.TCPFlagAck),
+ checker.SeqNum(uint32(iss+1)),
+ checker.AckNum(uint32(irs+5))))
+}
+
func TestSendRstOnListenerRxAckV4(t *testing.T) {
c := context.New(t, defaultMTU)
defer c.Cleanup()
@@ -816,7 +915,7 @@ func TestSendRstOnListenerRxAckV4(t *testing.T) {
checker.IPv4(t, c.GetPacket(), checker.TCP(
checker.DstPort(context.TestPort),
- checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+ checker.TCPFlags(header.TCPFlagRst),
checker.SeqNum(200)))
}
@@ -844,7 +943,7 @@ func TestSendRstOnListenerRxAckV6(t *testing.T) {
checker.IPv6(t, c.GetV6Packet(), checker.TCP(
checker.DstPort(context.TestPort),
- checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+ checker.TCPFlags(header.TCPFlagRst),
checker.SeqNum(200)))
}
@@ -986,12 +1085,12 @@ func TestTrafficClassV6(t *testing.T) {
func TestConnectBindToDevice(t *testing.T) {
for _, test := range []struct {
name string
- device string
+ device tcpip.NICID
want tcp.EndpointState
}{
- {"RightDevice", "nic1", tcp.StateEstablished},
- {"WrongDevice", "nic2", tcp.StateSynSent},
- {"AnyDevice", "", tcp.StateEstablished},
+ {"RightDevice", 1, tcp.StateEstablished},
+ {"WrongDevice", 2, tcp.StateSynSent},
+ {"AnyDevice", 0, tcp.StateEstablished},
} {
t.Run(test.name, func(t *testing.T) {
c := context.New(t, defaultMTU)
@@ -1043,6 +1142,71 @@ func TestConnectBindToDevice(t *testing.T) {
}
}
+func TestRstOnSynSent(t *testing.T) {
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+
+ // Create an endpoint, don't handshake because we want to interfere with the
+ // handshake process.
+ c.Create(-1)
+
+ // Start connection attempt.
+ waitEntry, ch := waiter.NewChannelEntry(nil)
+ c.WQ.EventRegister(&waitEntry, waiter.EventOut)
+ defer c.WQ.EventUnregister(&waitEntry)
+
+ addr := tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}
+ if err := c.EP.Connect(addr); err != tcpip.ErrConnectStarted {
+ t.Fatalf("got Connect(%+v) = %v, want %s", addr, err, tcpip.ErrConnectStarted)
+ }
+
+ // Receive SYN packet.
+ b := c.GetPacket()
+ checker.IPv4(t, b,
+ checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.TCPFlags(header.TCPFlagSyn),
+ ),
+ )
+
+ // Ensure that we've reached SynSent state
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+ t.Fatalf("got State() = %s, want %s", got, want)
+ }
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+ // Send a packet with a proper ACK and a RST flag to cause the socket
+ // to Error and close out
+ iss := seqnum.Value(789)
+ rcvWnd := seqnum.Size(30000)
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: tcpHdr.DestinationPort(),
+ DstPort: tcpHdr.SourcePort(),
+ Flags: header.TCPFlagRst | header.TCPFlagAck,
+ SeqNum: iss,
+ AckNum: c.IRS.Add(1),
+ RcvWnd: rcvWnd,
+ TCPOpts: nil,
+ })
+
+ // Wait for receive to be notified.
+ select {
+ case <-ch:
+ case <-time.After(3 * time.Second):
+ t.Fatal("timed out waiting for packet to arrive")
+ }
+
+ if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionRefused {
+ t.Fatalf("got c.EP.Read(nil) = %v, want = %s", err, tcpip.ErrConnectionRefused)
+ }
+
+ // Due to the RST the endpoint should be in an error state.
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
+ t.Fatalf("got State() = %s, want %s", got, want)
+ }
+}
+
func TestOutOfOrderReceive(t *testing.T) {
c := context.New(t, defaultMTU)
defer c.Cleanup()
@@ -1338,6 +1502,9 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+ // RST is always generated with sndNxt, which, if the FIN
+ // has been sent, will be 1 higher than the sequence
+ // number of the FIN itself.
checker.SeqNum(uint32(c.IRS)+2),
))
// The RST puts the endpoint into an error state.
@@ -1929,10 +2096,14 @@ func TestZeroScaledWindowReceive(t *testing.T) {
)
}
- // Read some data. An ack should be sent in response to that.
- v, _, err := c.EP.Read(nil)
- if err != nil {
- t.Fatalf("Read failed: %v", err)
+ // Read at least 1MSS of data. An ack should be sent in response to that.
+ sz := 0
+ for sz < defaultMTU {
+ v, _, err := c.EP.Read(nil)
+ if err != nil {
+ t.Fatalf("Read failed: %v", err)
+ }
+ sz += len(v)
}
checker.IPv4(t, c.GetPacket(),
@@ -1941,7 +2112,7 @@ func TestZeroScaledWindowReceive(t *testing.T) {
checker.DstPort(context.TestPort),
checker.SeqNum(uint32(c.IRS)+1),
checker.AckNum(uint32(790+sent)),
- checker.Window(uint16(len(v)>>ws)),
+ checker.Window(uint16(sz>>ws)),
checker.TCPFlags(header.TCPFlagAck),
),
)
@@ -2618,6 +2789,13 @@ loop:
if tcp.EndpointState(c.EP.State()) != tcp.StateError {
t.Fatalf("got EP state is not StateError")
}
+
+ if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 {
+ t.Errorf("got stats.TCP.EstablishedResets.Value() = %v, want = 1", got)
+ }
+ if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+ t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+ }
}
func TestSendOnResetConnection(t *testing.T) {
@@ -3625,46 +3803,41 @@ func TestBindToDeviceOption(t *testing.T) {
}
defer ep.Close()
- if err := s.CreateNamedNIC(321, "my_device", loopback.New()); err != nil {
- t.Errorf("CreateNamedNIC failed: %v", err)
- }
-
- // Make an nameless NIC.
- if err := s.CreateNIC(54321, loopback.New()); err != nil {
+ if err := s.CreateNIC(321, loopback.New()); err != nil {
t.Errorf("CreateNIC failed: %v", err)
}
- // strPtr is used instead of taking the address of string literals, which is
+ // nicIDPtr is used instead of taking the address of NICID literals, which is
// a compiler error.
- strPtr := func(s string) *string {
+ nicIDPtr := func(s tcpip.NICID) *tcpip.NICID {
return &s
}
testActions := []struct {
name string
- setBindToDevice *string
+ setBindToDevice *tcpip.NICID
setBindToDeviceError *tcpip.Error
getBindToDevice tcpip.BindToDeviceOption
}{
- {"GetDefaultValue", nil, nil, ""},
- {"BindToNonExistent", strPtr("non_existent_device"), tcpip.ErrUnknownDevice, ""},
- {"BindToExistent", strPtr("my_device"), nil, "my_device"},
- {"UnbindToDevice", strPtr(""), nil, ""},
+ {"GetDefaultValue", nil, nil, 0},
+ {"BindToNonExistent", nicIDPtr(999), tcpip.ErrUnknownDevice, 0},
+ {"BindToExistent", nicIDPtr(321), nil, 321},
+ {"UnbindToDevice", nicIDPtr(0), nil, 0},
}
for _, testAction := range testActions {
t.Run(testAction.name, func(t *testing.T) {
if testAction.setBindToDevice != nil {
bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice)
- if got, want := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; got != want {
- t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, got, want)
+ if gotErr, wantErr := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
+ t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, gotErr, wantErr)
}
}
- bindToDevice := tcpip.BindToDeviceOption("to be modified by GetSockOpt")
- if ep.GetSockOpt(&bindToDevice) != nil {
- t.Errorf("GetSockOpt got %v, want %v", ep.GetSockOpt(&bindToDevice), nil)
+ bindToDevice := tcpip.BindToDeviceOption(88888)
+ if err := ep.GetSockOpt(&bindToDevice); err != nil {
+ t.Errorf("GetSockOpt got %v, want %v", err, nil)
}
if got, want := bindToDevice, testAction.getBindToDevice; got != want {
- t.Errorf("bindToDevice got %q, want %q", got, want)
+ t.Errorf("bindToDevice got %d, want %d", got, want)
}
})
}
@@ -3858,12 +4031,12 @@ func TestConnectAvoidsBoundPorts(t *testing.T) {
switch network {
case "ipv4":
case "ipv6":
- if err := ep.SetSockOpt(tcpip.V6OnlyOption(1)); err != nil {
- t.Fatalf("SetSockOpt(V6OnlyOption(1)) failed: %v", err)
+ if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+ t.Fatalf("SetSockOpt(V6OnlyOption(true)) failed: %v", err)
}
case "dual":
- if err := ep.SetSockOpt(tcpip.V6OnlyOption(0)); err != nil {
- t.Fatalf("SetSockOpt(V6OnlyOption(0)) failed: %v", err)
+ if err := ep.SetSockOptBool(tcpip.V6OnlyOption, false); err != nil {
+ t.Fatalf("SetSockOpt(V6OnlyOption(false)) failed: %v", err)
}
default:
t.Fatalf("unknown network: '%s'", network)
@@ -4186,8 +4359,9 @@ func TestKeepalive(t *testing.T) {
c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+ const keepAliveInterval = 10 * time.Millisecond
c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
- c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(10 * time.Millisecond))
+ c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
c.EP.SetSockOpt(tcpip.KeepaliveCountOption(5))
c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1))
@@ -4277,19 +4451,43 @@ func TestKeepalive(t *testing.T) {
)
}
+ // Sleep for a little over the keepalive interval to make sure
+ // the timer has time to fire after the last ACK and close the
+ // socket.
+ time.Sleep(keepAliveInterval + 5*time.Millisecond)
+
// The connection should be terminated after 5 unacked keepalives.
+ // Send an ACK to trigger a RST from the stack as the endpoint should
+ // be dead.
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: c.Port,
+ Flags: header.TCPFlagAck,
+ SeqNum: 790,
+ AckNum: seqnum.Value(next),
+ RcvWnd: 30000,
+ })
+
checker.IPv4(t, c.GetPacket(),
checker.TCP(
checker.DstPort(context.TestPort),
checker.SeqNum(uint32(next)),
- checker.AckNum(uint32(790)),
- checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+ checker.AckNum(uint32(0)),
+ checker.TCPFlags(header.TCPFlagRst),
),
)
+ if got := c.Stack().Stats().TCP.EstablishedTimedout.Value(); got != 1 {
+ t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %v, want = 1", got)
+ }
+
if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout)
}
+
+ if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+ t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+ }
}
func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCookieInUse bool) (irs, iss seqnum.Value) {
@@ -4303,7 +4501,7 @@ func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCooki
RcvWnd: 30000,
})
- // Receive the SYN-ACK reply.w
+ // Receive the SYN-ACK reply.
b := c.GetPacket()
tcp := header.TCP(header.IPv4(b).Payload())
iss = seqnum.Value(tcp.SequenceNumber())
@@ -4336,6 +4534,50 @@ func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCooki
return irs, iss
}
+func executeV6Handshake(t *testing.T, c *context.Context, srcPort uint16, synCookieInUse bool) (irs, iss seqnum.Value) {
+ // Send a SYN request.
+ irs = seqnum.Value(789)
+ c.SendV6Packet(nil, &context.Headers{
+ SrcPort: srcPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagSyn,
+ SeqNum: irs,
+ RcvWnd: 30000,
+ })
+
+ // Receive the SYN-ACK reply.
+ b := c.GetV6Packet()
+ tcp := header.TCP(header.IPv6(b).Payload())
+ iss = seqnum.Value(tcp.SequenceNumber())
+ tcpCheckers := []checker.TransportChecker{
+ checker.SrcPort(context.StackPort),
+ checker.DstPort(srcPort),
+ checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
+ checker.AckNum(uint32(irs) + 1),
+ }
+
+ if synCookieInUse {
+ // When cookies are in use window scaling is disabled.
+ tcpCheckers = append(tcpCheckers, checker.TCPSynOptions(header.TCPSynOptions{
+ WS: -1,
+ MSS: c.MSSWithoutOptionsV6(),
+ }))
+ }
+
+ checker.IPv6(t, b, checker.TCP(tcpCheckers...))
+
+ // Send ACK.
+ c.SendV6Packet(nil, &context.Headers{
+ SrcPort: srcPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagAck,
+ SeqNum: irs + 1,
+ AckNum: iss + 1,
+ RcvWnd: 30000,
+ })
+ return irs, iss
+}
+
// TestListenBacklogFull tests that netstack does not complete handshakes if the
// listen backlog for the endpoint is full.
func TestListenBacklogFull(t *testing.T) {
@@ -5204,6 +5446,7 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
rawEP.SendPacketWithTS(b[start:start+mss], tsVal)
packetsSent++
}
+
// Resume the worker so that it only sees the packets once all of them
// are waiting to be read.
worker.ResumeWork()
@@ -5271,7 +5514,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
stk := c.Stack()
// Set lower limits for auto-tuning tests. This is required because the
// test stops the worker which can cause packets to be dropped because
- // the segment queue holding unprocessed packets is limited to 500.
+ // the segment queue holding unprocessed packets is limited to 300.
const receiveBufferSize = 80 << 10 // 80KB.
const maxReceiveBufferSize = receiveBufferSize * 10
if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{1, receiveBufferSize, maxReceiveBufferSize}); err != nil {
@@ -5326,6 +5569,7 @@ func TestReceiveBufferAutoTuning(t *testing.T) {
totalSent += mss
packetsSent++
}
+
// Resume it so that it only sees the packets once all of them
// are waiting to be read.
worker.ResumeWork()
@@ -5512,6 +5756,7 @@ func TestTCPTimeWaitRSTIgnored(t *testing.T) {
DstPort: context.StackPort,
Flags: header.TCPFlagSyn,
SeqNum: iss,
+ RcvWnd: 30000,
})
// Receive the SYN-ACK reply.
@@ -5630,6 +5875,7 @@ func TestTCPTimeWaitOutOfOrder(t *testing.T) {
DstPort: context.StackPort,
Flags: header.TCPFlagSyn,
SeqNum: iss,
+ RcvWnd: 30000,
})
// Receive the SYN-ACK reply.
@@ -5736,6 +5982,7 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
DstPort: context.StackPort,
Flags: header.TCPFlagSyn,
SeqNum: iss,
+ RcvWnd: 30000,
})
// Receive the SYN-ACK reply.
@@ -5809,6 +6056,7 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
DstPort: context.StackPort,
Flags: header.TCPFlagSyn,
SeqNum: iss,
+ RcvWnd: 30000,
})
c.CheckNoPacketTimeout("unexpected packet received in response to SYN", 1*time.Second)
@@ -5821,6 +6069,7 @@ func TestTCPTimeWaitNewSyn(t *testing.T) {
DstPort: context.StackPort,
Flags: header.TCPFlagSyn,
SeqNum: iss,
+ RcvWnd: 30000,
})
// Receive the SYN-ACK reply.
@@ -5867,6 +6116,8 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpTimeWaitTimeout, err)
}
+ want := c.Stack().Stats().TCP.EstablishedClosed.Value() + 1
+
wq := &waiter.Queue{}
ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
if err != nil {
@@ -5887,6 +6138,7 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
DstPort: context.StackPort,
Flags: header.TCPFlagSyn,
SeqNum: iss,
+ RcvWnd: 30000,
})
// Receive the SYN-ACK reply.
@@ -5992,6 +6244,463 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
checker.SrcPort(context.StackPort),
checker.DstPort(context.TestPort),
checker.SeqNum(uint32(ackHeaders.AckNum)),
- checker.AckNum(uint32(ackHeaders.SeqNum)),
- checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck)))
+ checker.AckNum(0),
+ checker.TCPFlags(header.TCPFlagRst)))
+
+ if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != want {
+ t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %v, want = %v", got, want)
+ }
+ if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+ t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got)
+ }
+}
+
+func TestTCPCloseWithData(t *testing.T) {
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+
+ // Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed
+ // after 5 seconds in TIME_WAIT state.
+ tcpTimeWaitTimeout := 5 * time.Second
+ if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
+ t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeoutOption(%d)) failed: %s", tcpTimeWaitTimeout, err)
+ }
+
+ wq := &waiter.Queue{}
+ ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+ if err != nil {
+ t.Fatalf("NewEndpoint failed: %s", err)
+ }
+ if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+ t.Fatalf("Bind failed: %s", err)
+ }
+
+ if err := ep.Listen(10); err != nil {
+ t.Fatalf("Listen failed: %s", err)
+ }
+
+ // Send a SYN request.
+ iss := seqnum.Value(789)
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagSyn,
+ SeqNum: iss,
+ RcvWnd: 30000,
+ })
+
+ // Receive the SYN-ACK reply.
+ b := c.GetPacket()
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+ ackHeaders := &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagAck,
+ SeqNum: iss + 1,
+ AckNum: c.IRS + 1,
+ RcvWnd: 30000,
+ }
+
+ // Send ACK.
+ c.SendPacket(nil, ackHeaders)
+
+ // Try to accept the connection.
+ we, ch := waiter.NewChannelEntry(nil)
+ wq.EventRegister(&we, waiter.EventIn)
+ defer wq.EventUnregister(&we)
+
+ c.EP, _, err = ep.Accept()
+ if err == tcpip.ErrWouldBlock {
+ // Wait for connection to be established.
+ select {
+ case <-ch:
+ c.EP, _, err = ep.Accept()
+ if err != nil {
+ t.Fatalf("Accept failed: %s", err)
+ }
+
+ case <-time.After(1 * time.Second):
+ t.Fatalf("Timed out waiting for accept")
+ }
+ }
+
+ // Now trigger a passive close by sending a FIN.
+ finHeaders := &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagAck | header.TCPFlagFin,
+ SeqNum: iss + 1,
+ AckNum: c.IRS + 2,
+ RcvWnd: 30000,
+ }
+
+ c.SendPacket(nil, finHeaders)
+
+ // Get the ACK to the FIN we just sent.
+ checker.IPv4(t, c.GetPacket(), checker.TCP(
+ checker.SrcPort(context.StackPort),
+ checker.DstPort(context.TestPort),
+ checker.SeqNum(uint32(c.IRS+1)),
+ checker.AckNum(uint32(iss)+2),
+ checker.TCPFlags(header.TCPFlagAck)))
+
+ // Now write a few bytes and then close the endpoint.
+ data := []byte{1, 2, 3}
+ view := buffer.NewView(len(data))
+ copy(view, data)
+
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %s", err)
+ }
+
+ // Check that data is received.
+ b = c.GetPacket()
+ checker.IPv4(t, b,
+ checker.PayloadLen(len(data)+header.TCPMinimumSize),
+ checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.SeqNum(uint32(c.IRS)+1),
+ checker.AckNum(uint32(iss)+2), // AckNum is the initial sequence number + 1 (SYN) + 1 (FIN)
+ checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+ ),
+ )
+
+ if p := b[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(data, p) {
+ t.Errorf("got data = %x, want = %x", p, data)
+ }
+
+ c.EP.Close()
+ // Check the FIN.
+ checker.IPv4(t, c.GetPacket(), checker.TCP(
+ checker.SrcPort(context.StackPort),
+ checker.DstPort(context.TestPort),
+ checker.SeqNum(uint32(c.IRS+1)+uint32(len(data))),
+ checker.AckNum(uint32(iss+2)),
+ checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+ // First send a partial ACK.
+ ackHeaders = &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagAck,
+ SeqNum: iss + 2,
+ AckNum: c.IRS + 1 + seqnum.Value(len(data)-1),
+ RcvWnd: 30000,
+ }
+ c.SendPacket(nil, ackHeaders)
+
+ // Now send a full ACK.
+ ackHeaders = &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagAck,
+ SeqNum: iss + 2,
+ AckNum: c.IRS + 1 + seqnum.Value(len(data)),
+ RcvWnd: 30000,
+ }
+ c.SendPacket(nil, ackHeaders)
+
+ // Now ACK the FIN.
+ ackHeaders.AckNum++
+ c.SendPacket(nil, ackHeaders)
+
+ // Now send an ACK and we should get a RST back as the endpoint should
+ // be in CLOSED state.
+ ackHeaders = &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagAck,
+ SeqNum: iss + 2,
+ AckNum: c.IRS + 1 + seqnum.Value(len(data)),
+ RcvWnd: 30000,
+ }
+ c.SendPacket(nil, ackHeaders)
+
+ // Check the RST.
+ checker.IPv4(t, c.GetPacket(), checker.TCP(
+ checker.SrcPort(context.StackPort),
+ checker.DstPort(context.TestPort),
+ checker.SeqNum(uint32(ackHeaders.AckNum)),
+ checker.AckNum(0),
+ checker.TCPFlags(header.TCPFlagRst)))
+}
+
+func TestTCPUserTimeout(t *testing.T) {
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+
+ c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+ origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
+
+ userTimeout := 50 * time.Millisecond
+ c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
+
+ // Send some data and wait before ACKing it.
+ view := buffer.NewView(3)
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+
+ next := uint32(c.IRS) + 1
+ checker.IPv4(t, c.GetPacket(),
+ checker.PayloadLen(len(view)+header.TCPMinimumSize),
+ checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.SeqNum(next),
+ checker.AckNum(790),
+ checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+ ),
+ )
+
+ // Wait for a little over the minimum retransmit timeout of 200ms for
+ // the retransmitTimer to fire and close the connection.
+ time.Sleep(tcp.MinRTO + 10*time.Millisecond)
+
+ // No packet should be received as the connection should be silently
+ // closed due to timeout.
+ c.CheckNoPacket("unexpected packet received after userTimeout has expired")
+
+ next += uint32(len(view))
+
+ // The connection should be terminated after userTimeout has expired.
+ // Send an ACK to trigger a RST from the stack as the endpoint should
+ // be dead.
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: c.Port,
+ Flags: header.TCPFlagAck,
+ SeqNum: 790,
+ AckNum: seqnum.Value(next),
+ RcvWnd: 30000,
+ })
+
+ checker.IPv4(t, c.GetPacket(),
+ checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.SeqNum(uint32(next)),
+ checker.AckNum(uint32(0)),
+ checker.TCPFlags(header.TCPFlagRst),
+ ),
+ )
+
+ if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
+ t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout)
+ }
+
+ if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want {
+ t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want)
+ }
+}
+
+func TestKeepaliveWithUserTimeout(t *testing.T) {
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+
+ c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+ origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
+
+ const keepAliveInterval = 10 * time.Millisecond
+ c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(10 * time.Millisecond))
+ c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval))
+ c.EP.SetSockOpt(tcpip.KeepaliveCountOption(10))
+ c.EP.SetSockOpt(tcpip.KeepaliveEnabledOption(1))
+
+ // Set userTimeout to be the duration for 3 keepalive probes.
+ userTimeout := 30 * time.Millisecond
+ c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout))
+
+ // Check that the connection is still alive.
+ if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+ t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock)
+ }
+
+ // Now receive 2 keepalives, but don't ACK them. The connection should
+ // be reset when the 3rd one would be sent: userTimeout is 30ms and, as
+ // set above, each keepalive probe is sent 10ms apart once the
+ // connection has been idle for 10ms.
+ for i := 0; i < 2; i++ {
+ b := c.GetPacket()
+ checker.IPv4(t, b,
+ checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.SeqNum(uint32(c.IRS)),
+ checker.AckNum(uint32(790)),
+ checker.TCPFlags(header.TCPFlagAck),
+ ),
+ )
+ }
+
+ // Sleep for a little over the keepalive interval to make sure
+ // the timer has time to fire after the last ACK and close the
+ // socket.
+ time.Sleep(keepAliveInterval + 5*time.Millisecond)
+
+ // The connection should be terminated after 30ms.
+ // Send an ACK to trigger a RST from the stack as the endpoint should
+ // be dead.
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: c.Port,
+ Flags: header.TCPFlagAck,
+ SeqNum: 790,
+ AckNum: seqnum.Value(c.IRS + 1),
+ RcvWnd: 30000,
+ })
+
+ checker.IPv4(t, c.GetPacket(),
+ checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.SeqNum(uint32(c.IRS+1)),
+ checker.AckNum(uint32(0)),
+ checker.TCPFlags(header.TCPFlagRst),
+ ),
+ )
+
+ if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
+ t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout)
+ }
+ if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want {
+ t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want)
+ }
+}
+
+func TestIncreaseWindowOnReceive(t *testing.T) {
+ // This test ensures that the endpoint sends an ACK after
+ // recv() when the window grows to more than 1 MSS.
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+
+ const rcvBuf = 65535 * 10
+ c.CreateConnected(789, 30000, rcvBuf)
+
+ // Write chunks of ~30000 bytes. It's important that two
+ // payloads together are at least as long as the MSS.
+ remain := rcvBuf
+ sent := 0
+ data := make([]byte, defaultMTU/2)
+ lastWnd := uint16(0)
+
+ for remain > len(data) {
+ c.SendPacket(data, &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: c.Port,
+ Flags: header.TCPFlagAck,
+ SeqNum: seqnum.Value(790 + sent),
+ AckNum: c.IRS.Add(1),
+ RcvWnd: 30000,
+ })
+ sent += len(data)
+ remain -= len(data)
+
+ lastWnd = uint16(remain)
+ if remain > 0xffff {
+ lastWnd = 0xffff
+ }
+ checker.IPv4(t, c.GetPacket(),
+ checker.PayloadLen(header.TCPMinimumSize),
+ checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.SeqNum(uint32(c.IRS)+1),
+ checker.AckNum(uint32(790+sent)),
+ checker.Window(lastWnd),
+ checker.TCPFlags(header.TCPFlagAck),
+ ),
+ )
+ }
+
+ if lastWnd == 0xffff || lastWnd == 0 {
+ t.Fatalf("expected small, non-zero window: %d", lastWnd)
+ }
+
+ // We now have < 1 MSS in the buffer space. Read the data! An
+ // ACK should be sent in response to that: the window was not
+ // zero, but it now grows to larger than the MSS.
+ if _, _, err := c.EP.Read(nil); err != nil {
+ t.Fatalf("Read failed: %v", err)
+ }
+
+ if _, _, err := c.EP.Read(nil); err != nil {
+ t.Fatalf("Read failed: %v", err)
+ }
+
+ // After reading two packets, we surely crossed MSS. See the ack:
+ checker.IPv4(t, c.GetPacket(),
+ checker.PayloadLen(header.TCPMinimumSize),
+ checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.SeqNum(uint32(c.IRS)+1),
+ checker.AckNum(uint32(790+sent)),
+ checker.Window(uint16(0xffff)),
+ checker.TCPFlags(header.TCPFlagAck),
+ ),
+ )
+}
+
+func TestIncreaseWindowOnBufferResize(t *testing.T) {
+ // This test ensures that the endpoint sends an ACK after the
+ // available receive buffer grows to more than 1 MSS.
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+
+ const rcvBuf = 65535 * 10
+ c.CreateConnected(789, 30000, rcvBuf)
+
+ // Write chunks of ~30000 bytes. It's important that two
+ // payloads together are at least as long as the MSS.
+ remain := rcvBuf
+ sent := 0
+ data := make([]byte, defaultMTU/2)
+ lastWnd := uint16(0)
+
+ for remain > len(data) {
+ c.SendPacket(data, &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: c.Port,
+ Flags: header.TCPFlagAck,
+ SeqNum: seqnum.Value(790 + sent),
+ AckNum: c.IRS.Add(1),
+ RcvWnd: 30000,
+ })
+ sent += len(data)
+ remain -= len(data)
+
+ lastWnd = uint16(remain)
+ if remain > 0xffff {
+ lastWnd = 0xffff
+ }
+ checker.IPv4(t, c.GetPacket(),
+ checker.PayloadLen(header.TCPMinimumSize),
+ checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.SeqNum(uint32(c.IRS)+1),
+ checker.AckNum(uint32(790+sent)),
+ checker.Window(lastWnd),
+ checker.TCPFlags(header.TCPFlagAck),
+ ),
+ )
+ }
+
+ if lastWnd == 0xffff || lastWnd == 0 {
+ t.Fatalf("expected small, non-zero window: %d", lastWnd)
+ }
+
+ // Increasing the buffer size should generate an ACK, since the
+ // window grew from a small value to one at least as large as the MSS.
+ c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBuf*2)
+
+ // After the buffer size is increased, the window crosses the MSS. See the ack:
+ checker.IPv4(t, c.GetPacket(),
+ checker.PayloadLen(header.TCPMinimumSize),
+ checker.TCP(
+ checker.DstPort(context.TestPort),
+ checker.SeqNum(uint32(c.IRS)+1),
+ checker.AckNum(uint32(790+sent)),
+ checker.Window(uint16(0xffff)),
+ checker.TCPFlags(header.TCPFlagAck),
+ ),
+ )
}
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 04fdaaed1..822907998 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -158,15 +158,17 @@ func New(t *testing.T, mtu uint32) *Context {
if testing.Verbose() {
wep = sniffer.New(ep)
}
- if err := s.CreateNamedNIC(1, "nic1", wep); err != nil {
- t.Fatalf("CreateNIC failed: %v", err)
+ opts := stack.NICOptions{Name: "nic1"}
+ if err := s.CreateNICWithOptions(1, wep, opts); err != nil {
+ t.Fatalf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
}
wep2 := stack.LinkEndpoint(channel.New(1000, mtu, ""))
if testing.Verbose() {
wep2 = sniffer.New(channel.New(1000, mtu, ""))
}
- if err := s.CreateNamedNIC(2, "nic2", wep2); err != nil {
- t.Fatalf("CreateNIC failed: %v", err)
+ opts2 := stack.NICOptions{Name: "nic2"}
+ if err := s.CreateNICWithOptions(2, wep2, opts2); err != nil {
+ t.Fatalf("CreateNICWithOptions(_, _, %+v) failed: %v", opts2, err)
}
if err := s.AddAddress(1, ipv4.ProtocolNumber, StackAddr); err != nil {
@@ -231,6 +233,7 @@ func (c *Context) CheckNoPacket(errMsg string) {
// addresses. It will fail with an error if no packet is received for
// 2 seconds.
func (c *Context) GetPacket() []byte {
+ c.t.Helper()
select {
case p := <-c.linkEP.C:
if p.Proto != ipv4.ProtocolNumber {
@@ -259,6 +262,7 @@ func (c *Context) GetPacket() []byte {
// and destination address. If no packet is available it will return
// nil immediately.
func (c *Context) GetPacketNonBlocking() []byte {
+ c.t.Helper()
select {
case p := <-c.linkEP.C:
if p.Proto != ipv4.ProtocolNumber {
@@ -471,11 +475,7 @@ func (c *Context) CreateV6Endpoint(v6only bool) {
c.t.Fatalf("NewEndpoint failed: %v", err)
}
- var v tcpip.V6OnlyOption
- if v6only {
- v = 1
- }
- if err := c.EP.SetSockOpt(v); err != nil {
+ if err := c.EP.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil {
c.t.Fatalf("SetSockOpt failed failed: %v", err)
}
}
@@ -483,6 +483,7 @@ func (c *Context) CreateV6Endpoint(v6only bool) {
// GetV6Packet reads a single packet from the link layer endpoint of the context
// and asserts that it is an IPv6 Packet with the expected src/dest addresses.
func (c *Context) GetV6Packet() []byte {
+ c.t.Helper()
select {
case p := <-c.linkEP.C:
if p.Proto != ipv6.ProtocolNumber {
@@ -1089,3 +1090,9 @@ func (c *Context) SetGSOEnabled(enable bool) {
func (c *Context) MSSWithoutOptions() uint16 {
return uint16(c.linkEP.MTU() - header.IPv4MinimumSize - header.TCPMinimumSize)
}
+
+// MSSWithoutOptionsV6 returns the value for the MSS used by the stack when no
+// options are in use for IPv6 packets.
+func (c *Context) MSSWithoutOptionsV6() uint16 {
+ return uint16(c.linkEP.MTU() - header.IPv6MinimumSize - header.TCPMinimumSize)
+}
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index 8d4c3808f..57ff123e3 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -30,10 +30,12 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//pkg/sleep",
+ "//pkg/sync",
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
"//pkg/tcpip/iptables",
+ "//pkg/tcpip/ports",
"//pkg/tcpip/stack",
"//pkg/tcpip/transport/raw",
"//pkg/waiter",
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 24cb88c13..c9cbed8f4 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -15,12 +15,12 @@
package udp
import (
- "sync"
-
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/buffer"
"gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/iptables"
+ "gvisor.dev/gvisor/pkg/tcpip/ports"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -31,6 +31,7 @@ type udpPacket struct {
senderAddress tcpip.FullAddress
data buffer.VectorisedView `state:".(buffer.VectorisedView)"`
timestamp int64
+ tos uint8
}
// EndpointState represents the state of a UDP endpoint.
@@ -107,11 +108,16 @@ type endpoint struct {
// Values used to reserve a port or register a transport endpoint.
 	// (whichever happens first).
boundBindToDevice tcpip.NICID
+ boundPortFlags ports.Flags
// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
// applied while sending packets. Defaults to 0 as on Linux.
sendTOS uint8
+ // receiveTOS determines if the incoming IPv4 TOS header field is passed
+ // as ancillary data to ControlMessages on Read.
+ receiveTOS bool
+
// shutdownFlags represent the current shutdown state of the endpoint.
shutdownFlags tcpip.ShutdownFlags
@@ -180,8 +186,9 @@ func (e *endpoint) Close() {
switch e.state {
case StateBound, StateConnected:
e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice)
- e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+ e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
e.boundBindToDevice = 0
+ e.boundPortFlags = ports.Flags{}
}
for _, mem := range e.multicastMemberships {
@@ -241,7 +248,18 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
*addr = p.senderAddress
}
- return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil
+ cm := tcpip.ControlMessages{
+ HasTimestamp: true,
+ Timestamp: p.timestamp,
+ }
+ e.mu.RLock()
+ receiveTOS := e.receiveTOS
+ e.mu.RUnlock()
+ if receiveTOS {
+ cm.HasTOS = true
+ cm.TOS = p.tos
+ }
+ return p.data.ToView(), cm, nil
}
// prepareForWrite prepares the endpoint for sending data. In particular, it
@@ -400,7 +418,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
return 0, nil, tcpip.ErrBroadcastDisabled
}
- netProto, err := e.checkV4Mapped(to, false)
+ netProto, err := e.checkV4Mapped(to)
if err != nil {
return 0, nil, err
}
@@ -453,14 +471,15 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
return 0, tcpip.ControlMessages{}, nil
}
-// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
-func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
- return nil
-}
+// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
+func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+ switch opt {
+ case tcpip.ReceiveTOSOption:
+ e.mu.Lock()
+ e.receiveTOS = v
+ e.mu.Unlock()
+ return nil
-// SetSockOpt implements tcpip.Endpoint.SetSockOpt.
-func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
- switch v := opt.(type) {
case tcpip.V6OnlyOption:
// We only recognize this option on v6 endpoints.
if e.NetProto != header.IPv6ProtocolNumber {
@@ -475,8 +494,20 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
return tcpip.ErrInvalidEndpointState
}
- e.v6only = v != 0
+ e.v6only = v
+ }
+ return nil
+}
+
+// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+ return nil
+}
+
+// SetSockOpt implements tcpip.Endpoint.SetSockOpt.
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+ switch v := opt.(type) {
case tcpip.TTLOption:
e.mu.Lock()
e.ttl = uint8(v)
@@ -492,7 +523,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
defer e.mu.Unlock()
fa := tcpip.FullAddress{Addr: v.InterfaceAddr}
- netProto, err := e.checkV4Mapped(&fa, false)
+ netProto, err := e.checkV4Mapped(&fa)
if err != nil {
return err
}
@@ -621,19 +652,14 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
e.mu.Unlock()
case tcpip.BindToDeviceOption:
- e.mu.Lock()
- defer e.mu.Unlock()
- if v == "" {
- e.bindToDevice = 0
- return nil
- }
- for nicID, nic := range e.stack.NICInfo() {
- if nic.Name == string(v) {
- e.bindToDevice = nicID
- return nil
- }
+ id := tcpip.NICID(v)
+ if id != 0 && !e.stack.HasNIC(id) {
+ return tcpip.ErrUnknownDevice
}
- return tcpip.ErrUnknownDevice
+ e.mu.Lock()
+ e.bindToDevice = id
+ e.mu.Unlock()
+ return nil
case tcpip.BroadcastOption:
e.mu.Lock()
@@ -657,8 +683,33 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
return nil
}
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+ switch opt {
+ case tcpip.ReceiveTOSOption:
+ e.mu.RLock()
+ v := e.receiveTOS
+ e.mu.RUnlock()
+ return v, nil
+
+ case tcpip.V6OnlyOption:
+ // We only recognize this option on v6 endpoints.
+ if e.NetProto != header.IPv6ProtocolNumber {
+ return false, tcpip.ErrUnknownProtocolOption
+ }
+
+ e.mu.RLock()
+ v := e.v6only
+ e.mu.RUnlock()
+
+ return v, nil
+ }
+
+ return false, tcpip.ErrUnknownProtocolOption
+}
+
// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
-func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
+func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
switch opt {
case tcpip.ReceiveQueueSizeOption:
v := 0
@@ -692,22 +743,6 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
case tcpip.ErrorOption:
return nil
- case *tcpip.V6OnlyOption:
- // We only recognize this option on v6 endpoints.
- if e.NetProto != header.IPv6ProtocolNumber {
- return tcpip.ErrUnknownProtocolOption
- }
-
- e.mu.Lock()
- v := e.v6only
- e.mu.Unlock()
-
- *o = 0
- if v {
- *o = 1
- }
- return nil
-
case *tcpip.TTLOption:
e.mu.Lock()
*o = tcpip.TTLOption(e.ttl)
@@ -754,12 +789,8 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
case *tcpip.BindToDeviceOption:
e.mu.RLock()
- defer e.mu.RUnlock()
- if nic, ok := e.stack.NICInfo()[e.bindToDevice]; ok {
- *o = tcpip.BindToDeviceOption(nic.Name)
- return nil
- }
- *o = tcpip.BindToDeviceOption("")
+ *o = tcpip.BindToDeviceOption(e.bindToDevice)
+ e.mu.RUnlock()
return nil
case *tcpip.KeepaliveEnabledOption:
@@ -836,35 +867,12 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
return nil
}
-func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress, allowMismatch bool) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
- netProto := e.NetProto
- if len(addr.Addr) == 0 {
- return netProto, nil
- }
- if header.IsV4MappedAddress(addr.Addr) {
- // Fail if using a v4 mapped address on a v6only endpoint.
- if e.v6only {
- return 0, tcpip.ErrNoRoute
- }
-
- netProto = header.IPv4ProtocolNumber
- addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
- if addr.Addr == header.IPv4Any {
- addr.Addr = ""
- }
-
- // Fail if we are bound to an IPv6 address.
- if !allowMismatch && len(e.ID.LocalAddress) == 16 {
- return 0, tcpip.ErrNetworkUnreachable
- }
- }
-
- // Fail if we're bound to an address length different from the one we're
- // checking.
- if l := len(e.ID.LocalAddress); l != 0 && l != len(addr.Addr) {
- return 0, tcpip.ErrInvalidEndpointState
+func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
+ unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProto(*addr, e.v6only)
+ if err != nil {
+ return 0, err
}
-
+ *addr = unwrapped
return netProto, nil
}
@@ -895,7 +903,8 @@ func (e *endpoint) Disconnect() *tcpip.Error {
} else {
if e.ID.LocalPort != 0 {
// Release the ephemeral port.
- e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundBindToDevice)
+ e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice)
+ e.boundPortFlags = ports.Flags{}
}
e.state = StateInitial
}
@@ -912,7 +921,7 @@ func (e *endpoint) Disconnect() *tcpip.Error {
// Connect connects the endpoint to its peer. Specifying a NIC is optional.
func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
- netProto, err := e.checkV4Mapped(&addr, false)
+ netProto, err := e.checkV4Mapped(&addr)
if err != nil {
return err
}
@@ -1042,16 +1051,23 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) {
if e.ID.LocalPort == 0 {
- port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.reusePort, e.bindToDevice)
+ flags := ports.Flags{
+ LoadBalanced: e.reusePort,
+ // FIXME(b/129164367): Support SO_REUSEADDR.
+ MostRecent: false,
+ }
+ port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, flags, e.bindToDevice)
if err != nil {
return id, e.bindToDevice, err
}
+ e.boundPortFlags = flags
id.LocalPort = port
}
err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice)
if err != nil {
- e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.bindToDevice)
+ e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.boundPortFlags, e.bindToDevice)
+ e.boundPortFlags = ports.Flags{}
}
return id, e.bindToDevice, err
}
@@ -1063,7 +1079,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
return tcpip.ErrInvalidEndpointState
}
- netProto, err := e.checkV4Mapped(&addr, true)
+ netProto, err := e.checkV4Mapped(&addr)
if err != nil {
return err
}
@@ -1134,9 +1150,14 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
e.mu.RLock()
defer e.mu.RUnlock()
+ addr := e.ID.LocalAddress
+ if e.state == StateConnected {
+ addr = e.route.LocalAddress
+ }
+
return tcpip.FullAddress{
NIC: e.RegisterNICID,
- Addr: e.ID.LocalAddress,
+ Addr: addr,
Port: e.ID.LocalPort,
}, nil
}
@@ -1222,6 +1243,12 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk
e.rcvList.PushBack(packet)
e.rcvBufSize += pkt.Data.Size()
+ // Save any useful information from the network header to the packet.
+ switch r.NetProto {
+ case header.IPv4ProtocolNumber:
+ packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
+ }
+
packet.timestamp = e.stack.NowNanoseconds()
e.rcvMu.Unlock()
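
A minimal usage sketch of the ReceiveTOS plumbing added above, assuming an already-created UDP tcpip.Endpoint (the helper name readWithTOS and the surrounding setup are illustrative only, not part of this change): enable tcpip.ReceiveTOSOption with SetSockOptBool, then read the TOS byte from the ControlMessages returned alongside the payload by Read.

package main

import (
	"log"

	"gvisor.dev/gvisor/pkg/tcpip"
)

// readWithTOS enables ReceiveTOSOption on a UDP endpoint and logs the TOS
// byte delivered as ancillary data with each datagram. Endpoint creation,
// binding, and blocking on the waiter queue are omitted for brevity.
func readWithTOS(ep tcpip.Endpoint) {
	if err := ep.SetSockOptBool(tcpip.ReceiveTOSOption, true); err != nil {
		log.Fatalf("SetSockOptBool(tcpip.ReceiveTOSOption, true) failed: %v", err)
	}

	var sender tcpip.FullAddress
	v, cm, err := ep.Read(&sender)
	if err != nil {
		// A real caller would wait for readability on tcpip.ErrWouldBlock.
		log.Fatalf("Read failed: %v", err)
	}
	if cm.HasTOS {
		log.Printf("read %d bytes from %v, TOS=%#x", len(v), sender, cm.TOS)
	}
}
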
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 7051a7a9c..ee9d10555 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -56,6 +56,7 @@ const (
multicastAddr = "\xe8\x2b\xd3\xea"
multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
broadcastAddr = header.IPv4Broadcast
+ testTOS = 0x80
// defaultMTU is the MTU, in bytes, used throughout the tests, except
// where another value is explicitly used. It is chosen to match the MTU
@@ -335,7 +336,7 @@ func (c *testContext) createEndpointForFlow(flow testFlow) {
c.createEndpoint(flow.sockProto())
if flow.isV6Only() {
- if err := c.ep.SetSockOpt(tcpip.V6OnlyOption(1)); err != nil {
+ if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
c.t.Fatalf("SetSockOpt failed: %v", err)
}
} else if flow.isBroadcast() {
@@ -453,6 +454,7 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool
ip := header.IPv4(buf)
ip.Encode(&header.IPv4Fields{
IHL: header.IPv4MinimumSize,
+ TOS: testTOS,
TotalLength: uint16(len(buf)),
TTL: 65,
Protocol: uint8(udp.ProtocolNumber),
@@ -508,46 +510,42 @@ func TestBindToDeviceOption(t *testing.T) {
}
defer ep.Close()
- if err := s.CreateNamedNIC(321, "my_device", loopback.New()); err != nil {
- t.Errorf("CreateNamedNIC failed: %v", err)
+ opts := stack.NICOptions{Name: "my_device"}
+ if err := s.CreateNICWithOptions(321, loopback.New(), opts); err != nil {
+ t.Errorf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
}
- // Make an nameless NIC.
- if err := s.CreateNIC(54321, loopback.New()); err != nil {
- t.Errorf("CreateNIC failed: %v", err)
- }
-
- // strPtr is used instead of taking the address of string literals, which is
+ // nicIDPtr is used instead of taking the address of NICID literals, which is
// a compiler error.
- strPtr := func(s string) *string {
+ nicIDPtr := func(s tcpip.NICID) *tcpip.NICID {
return &s
}
testActions := []struct {
name string
- setBindToDevice *string
+ setBindToDevice *tcpip.NICID
setBindToDeviceError *tcpip.Error
getBindToDevice tcpip.BindToDeviceOption
}{
- {"GetDefaultValue", nil, nil, ""},
- {"BindToNonExistent", strPtr("non_existent_device"), tcpip.ErrUnknownDevice, ""},
- {"BindToExistent", strPtr("my_device"), nil, "my_device"},
- {"UnbindToDevice", strPtr(""), nil, ""},
+ {"GetDefaultValue", nil, nil, 0},
+ {"BindToNonExistent", nicIDPtr(999), tcpip.ErrUnknownDevice, 0},
+ {"BindToExistent", nicIDPtr(321), nil, 321},
+ {"UnbindToDevice", nicIDPtr(0), nil, 0},
}
for _, testAction := range testActions {
t.Run(testAction.name, func(t *testing.T) {
if testAction.setBindToDevice != nil {
bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice)
- if got, want := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; got != want {
- t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, got, want)
+ if gotErr, wantErr := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
+ t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, gotErr, wantErr)
}
}
- bindToDevice := tcpip.BindToDeviceOption("to be modified by GetSockOpt")
- if ep.GetSockOpt(&bindToDevice) != nil {
- t.Errorf("GetSockOpt got %v, want %v", ep.GetSockOpt(&bindToDevice), nil)
+ bindToDevice := tcpip.BindToDeviceOption(88888)
+ if err := ep.GetSockOpt(&bindToDevice); err != nil {
+ t.Errorf("GetSockOpt got %v, want %v", err, nil)
}
if got, want := bindToDevice, testAction.getBindToDevice; got != want {
- t.Errorf("bindToDevice got %q, want %q", got, want)
+ t.Errorf("bindToDevice got %d, want %d", got, want)
}
})
}
@@ -556,8 +554,8 @@ func TestBindToDeviceOption(t *testing.T) {
// testReadInternal sends a packet of the given test flow into the stack by
// injecting it into the link endpoint. It then attempts to read it from the
// UDP endpoint and depending on if this was expected to succeed verifies its
-// correctness.
-func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool) {
+// correctness including any additional checker functions provided.
+func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool, checkers ...checker.ControlMessagesChecker) {
c.t.Helper()
payload := newPayload()
@@ -572,12 +570,12 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
var addr tcpip.FullAddress
- v, _, err := c.ep.Read(&addr)
+ v, cm, err := c.ep.Read(&addr)
if err == tcpip.ErrWouldBlock {
// Wait for data to become available.
select {
case <-ch:
- v, _, err = c.ep.Read(&addr)
+ v, cm, err = c.ep.Read(&addr)
case <-time.After(300 * time.Millisecond):
if packetShouldBeDropped {
@@ -610,15 +608,21 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe
if !bytes.Equal(payload, v) {
c.t.Fatalf("bad payload: got %x, want %x", v, payload)
}
+
+ // Run any checkers against the ControlMessages.
+ for _, f := range checkers {
+ f(c.t, cm)
+ }
+
c.checkEndpointReadStats(1, epstats, err)
}
// testRead sends a packet of the given test flow into the stack by injecting it
// into the link endpoint. It then reads it from the UDP endpoint and verifies
-// its correctness.
-func testRead(c *testContext, flow testFlow) {
+// its correctness including any additional checker functions provided.
+func testRead(c *testContext, flow testFlow, checkers ...checker.ControlMessagesChecker) {
c.t.Helper()
- testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */)
+ testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */, checkers...)
}
// testFailingRead sends a packet of the given test flow into the stack by
@@ -1286,7 +1290,7 @@ func TestTOSV4(t *testing.T) {
c.createEndpointForFlow(flow)
- const tos = 0xC0
+ const tos = testTOS
var v tcpip.IPv4TOSOption
if err := c.ep.GetSockOpt(&v); err != nil {
c.t.Errorf("GetSockopt failed: %s", err)
@@ -1321,7 +1325,7 @@ func TestTOSV6(t *testing.T) {
c.createEndpointForFlow(flow)
- const tos = 0xC0
+ const tos = testTOS
var v tcpip.IPv6TrafficClassOption
if err := c.ep.GetSockOpt(&v); err != nil {
c.t.Errorf("GetSockopt failed: %s", err)
@@ -1348,6 +1352,47 @@ func TestTOSV6(t *testing.T) {
}
}
+func TestReceiveTOSV4(t *testing.T) {
+ for _, flow := range []testFlow{unicastV4, broadcast} {
+ t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+ c := newDualTestContext(t, defaultMTU)
+ defer c.cleanup()
+
+ c.createEndpointForFlow(flow)
+
+ // Verify that setting and reading the option works.
+ v, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
+ if err != nil {
+ c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
+ }
+ // Test for expected default value.
+ if v != false {
+ c.t.Errorf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", v, false)
+ }
+
+ want := true
+ if err := c.ep.SetSockOptBool(tcpip.ReceiveTOSOption, want); err != nil {
+ c.t.Fatalf("SetSockOptBool(tcpip.ReceiveTOSOption, %t) failed: %s", want, err)
+ }
+
+ got, err := c.ep.GetSockOptBool(tcpip.ReceiveTOSOption)
+ if err != nil {
+ c.t.Fatal("GetSockOptBool(tcpip.ReceiveTOSOption) failed:", err)
+ }
+ if got != want {
+ c.t.Fatalf("got GetSockOptBool(tcpip.ReceiveTOSOption) = %t, want = %t", got, want)
+ }
+
+ // Verify that the correct received TOS is handed through as
+ // ancillary data to the ControlMessages struct.
+ if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+ c.t.Fatal("Bind failed:", err)
+ }
+ testRead(c, flow, checker.ReceiveTOS(testTOS))
+ })
+ }
+}
+
func TestMulticastInterfaceOption(t *testing.T) {
for _, flow := range []testFlow{multicastV4, multicastV4in6, multicastV6, multicastV6Only} {
t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD
index 6afdb29b7..07778e4f7 100644
--- a/pkg/tmutex/BUILD
+++ b/pkg/tmutex/BUILD
@@ -15,4 +15,5 @@ go_test(
size = "medium",
srcs = ["tmutex_test.go"],
embed = [":tmutex"],
+ deps = ["//pkg/sync"],
)
diff --git a/pkg/tmutex/tmutex_test.go b/pkg/tmutex/tmutex_test.go
index ce34c7962..05540696a 100644
--- a/pkg/tmutex/tmutex_test.go
+++ b/pkg/tmutex/tmutex_test.go
@@ -17,10 +17,11 @@ package tmutex
import (
"fmt"
"runtime"
- "sync"
"sync/atomic"
"testing"
"time"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
func TestBasicLock(t *testing.T) {
diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD
index 8f6f180e5..d1885ae66 100644
--- a/pkg/unet/BUILD
+++ b/pkg/unet/BUILD
@@ -24,4 +24,5 @@ go_test(
"unet_test.go",
],
embed = [":unet"],
+ deps = ["//pkg/sync"],
)
diff --git a/pkg/unet/unet_test.go b/pkg/unet/unet_test.go
index a3cc6f5d3..5c4b9e8e9 100644
--- a/pkg/unet/unet_test.go
+++ b/pkg/unet/unet_test.go
@@ -19,10 +19,11 @@ import (
"os"
"path/filepath"
"reflect"
- "sync"
"syscall"
"testing"
"time"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
func randomFilename() (string, error) {
diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD
index b6bbb0ea2..b8fdc3125 100644
--- a/pkg/urpc/BUILD
+++ b/pkg/urpc/BUILD
@@ -11,6 +11,7 @@ go_library(
deps = [
"//pkg/fd",
"//pkg/log",
+ "//pkg/sync",
"//pkg/unet",
],
)
diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go
index df59ffab1..13b2ea314 100644
--- a/pkg/urpc/urpc.go
+++ b/pkg/urpc/urpc.go
@@ -27,10 +27,10 @@ import (
"os"
"reflect"
"runtime"
- "sync"
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
)
diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD
index 0427bc41f..1c6890e52 100644
--- a/pkg/waiter/BUILD
+++ b/pkg/waiter/BUILD
@@ -24,6 +24,7 @@ go_library(
],
importpath = "gvisor.dev/gvisor/pkg/waiter",
visibility = ["//visibility:public"],
+ deps = ["//pkg/sync"],
)
go_test(
diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go
index 8a65ed164..f708e95fa 100644
--- a/pkg/waiter/waiter.go
+++ b/pkg/waiter/waiter.go
@@ -58,7 +58,7 @@
package waiter
import (
- "sync"
+ "gvisor.dev/gvisor/pkg/sync"
)
// EventMask represents io events as used in the poll() syscall.
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index 847d2f91c..3e20f8f2f 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -7,6 +7,7 @@ go_library(
srcs = [
"compat.go",
"compat_amd64.go",
+ "compat_arm64.go",
"config.go",
"controller.go",
"debug.go",
@@ -73,6 +74,7 @@ go_library(
"//pkg/sentry/usage",
"//pkg/sentry/usermem",
"//pkg/sentry/watchdog",
+ "//pkg/sync",
"//pkg/syserror",
"//pkg/tcpip",
"//pkg/tcpip/link/fdbased",
@@ -110,10 +112,10 @@ go_test(
"//pkg/control/server",
"//pkg/log",
"//pkg/p9",
- "//pkg/sentry/arch:registers_go_proto",
"//pkg/sentry/context/contexttest",
"//pkg/sentry/fs",
"//pkg/sentry/kernel/auth",
+ "//pkg/sync",
"//pkg/unet",
"//runsc/fsgofer",
"@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go
index 07e35ab10..9c23b9553 100644
--- a/runsc/boot/compat.go
+++ b/runsc/boot/compat.go
@@ -17,18 +17,16 @@ package boot
import (
"fmt"
"os"
- "sync"
"syscall"
"github.com/golang/protobuf/proto"
- "gvisor.dev/gvisor/pkg/abi"
"gvisor.dev/gvisor/pkg/eventchannel"
"gvisor.dev/gvisor/pkg/log"
- "gvisor.dev/gvisor/pkg/sentry/arch"
rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
"gvisor.dev/gvisor/pkg/sentry/strace"
spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
+ "gvisor.dev/gvisor/pkg/sync"
)
func initCompatLogs(fd int) error {
@@ -53,9 +51,9 @@ type compatEmitter struct {
}
func newCompatEmitter(logFD int) (*compatEmitter, error) {
- nameMap, ok := strace.Lookup(abi.Linux, arch.AMD64)
+ nameMap, ok := getSyscallNameMap()
if !ok {
- return nil, fmt.Errorf("amd64 Linux syscall table not found")
+ return nil, fmt.Errorf("Linux syscall table not found")
}
c := &compatEmitter{
@@ -86,16 +84,16 @@ func (c *compatEmitter) Emit(msg proto.Message) (bool, error) {
}
func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
- regs := us.Registers.GetArch().(*rpb.Registers_Amd64).Amd64
+ regs := us.Registers
c.mu.Lock()
defer c.mu.Unlock()
- sysnr := regs.OrigRax
+ sysnr := syscallNum(regs)
tr := c.trackers[sysnr]
if tr == nil {
switch sysnr {
- case syscall.SYS_PRCTL, syscall.SYS_ARCH_PRCTL:
+ case syscall.SYS_PRCTL:
// args: cmd, ...
tr = newArgsTracker(0)
@@ -112,10 +110,14 @@ func (c *compatEmitter) emitUnimplementedSyscall(us *spb.UnimplementedSyscall) {
tr = newArgsTracker(2)
default:
- tr = &onceTracker{}
+ tr = newArchArgsTracker(sysnr)
+ if tr == nil {
+ tr = &onceTracker{}
+ }
}
c.trackers[sysnr] = tr
}
+
if tr.shouldReport(regs) {
c.sink.Infof("Unsupported syscall: %s, regs: %+v", c.nameMap.Name(uintptr(sysnr)), regs)
tr.onReported(regs)
@@ -139,10 +141,10 @@ func (c *compatEmitter) Close() error {
// the syscall and arguments.
type syscallTracker interface {
 	// shouldReport returns true if the syscall should be reported.
- shouldReport(regs *rpb.AMD64Registers) bool
+ shouldReport(regs *rpb.Registers) bool
// onReported marks the syscall as reported.
- onReported(regs *rpb.AMD64Registers)
+ onReported(regs *rpb.Registers)
}
// onceTracker reports only a single time, used for most syscalls.
@@ -150,10 +152,45 @@ type onceTracker struct {
reported bool
}
-func (o *onceTracker) shouldReport(_ *rpb.AMD64Registers) bool {
+func (o *onceTracker) shouldReport(_ *rpb.Registers) bool {
return !o.reported
}
-func (o *onceTracker) onReported(_ *rpb.AMD64Registers) {
+func (o *onceTracker) onReported(_ *rpb.Registers) {
o.reported = true
}
+
+// argsTracker reports only once for each different combination of arguments.
+// It's used for generic syscalls like ioctl to report once per 'cmd'.
+type argsTracker struct {
+	// argsIdx are the syscall argument indexes to use as the unique ID.
+ argsIdx []int
+ reported map[string]struct{}
+ count int
+}
+
+func newArgsTracker(argIdx ...int) *argsTracker {
+ return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
+}
+
+// key returns the command based on the syscall argument index.
+func (a *argsTracker) key(regs *rpb.Registers) string {
+ var rv string
+ for _, idx := range a.argsIdx {
+ rv += fmt.Sprintf("%d|", argVal(idx, regs))
+ }
+ return rv
+}
+
+func (a *argsTracker) shouldReport(regs *rpb.Registers) bool {
+ if a.count >= reportLimit {
+ return false
+ }
+ _, ok := a.reported[a.key(regs)]
+ return !ok
+}
+
+func (a *argsTracker) onReported(regs *rpb.Registers) {
+ a.count++
+ a.reported[a.key(regs)] = struct{}{}
+}
diff --git a/runsc/boot/compat_amd64.go b/runsc/boot/compat_amd64.go
index 43cd0db94..42b0ca8b0 100644
--- a/runsc/boot/compat_amd64.go
+++ b/runsc/boot/compat_amd64.go
@@ -16,62 +16,81 @@ package boot
import (
"fmt"
+ "syscall"
+ "gvisor.dev/gvisor/pkg/abi"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+ "gvisor.dev/gvisor/pkg/sentry/strace"
)
// reportLimit is the max number of events that should be reported per tracker.
const reportLimit = 100
-// argsTracker reports only once for each different combination of arguments.
-// It's used for generic syscalls like ioctl to report once per 'cmd'.
-type argsTracker struct {
- // argsIdx is the syscall arguments to use as unique ID.
- argsIdx []int
- reported map[string]struct{}
- count int
+// newRegs creates an empty Registers instance.
+func newRegs() *rpb.Registers {
+ return &rpb.Registers{
+ Arch: &rpb.Registers_Amd64{
+ Amd64: &rpb.AMD64Registers{},
+ },
+ }
}
-func newArgsTracker(argIdx ...int) *argsTracker {
- return &argsTracker{argsIdx: argIdx, reported: make(map[string]struct{})}
-}
+func argVal(argIdx int, regs *rpb.Registers) uint32 {
+ amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
-// cmd returns the command based on the syscall argument index.
-func (a *argsTracker) key(regs *rpb.AMD64Registers) string {
- var rv string
- for _, idx := range a.argsIdx {
- rv += fmt.Sprintf("%d|", argVal(idx, regs))
+ switch argIdx {
+ case 0:
+ return uint32(amd64Regs.Rdi)
+ case 1:
+ return uint32(amd64Regs.Rsi)
+ case 2:
+ return uint32(amd64Regs.Rdx)
+ case 3:
+ return uint32(amd64Regs.R10)
+ case 4:
+ return uint32(amd64Regs.R8)
+ case 5:
+ return uint32(amd64Regs.R9)
}
- return rv
+ panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
}
-func argVal(argIdx int, regs *rpb.AMD64Registers) uint32 {
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+ amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+
switch argIdx {
case 0:
- return uint32(regs.Rdi)
+ amd64Regs.Rdi = argVal
case 1:
- return uint32(regs.Rsi)
+ amd64Regs.Rsi = argVal
case 2:
- return uint32(regs.Rdx)
+ amd64Regs.Rdx = argVal
case 3:
- return uint32(regs.R10)
+ amd64Regs.R10 = argVal
case 4:
- return uint32(regs.R8)
+ amd64Regs.R8 = argVal
case 5:
- return uint32(regs.R9)
+ amd64Regs.R9 = argVal
+ default:
+ panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
}
- panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
}
-func (a *argsTracker) shouldReport(regs *rpb.AMD64Registers) bool {
- if a.count >= reportLimit {
- return false
- }
- _, ok := a.reported[a.key(regs)]
- return !ok
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+ return strace.Lookup(abi.Linux, arch.AMD64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+ amd64Regs := regs.GetArch().(*rpb.Registers_Amd64).Amd64
+ return amd64Regs.OrigRax
}
-func (a *argsTracker) onReported(regs *rpb.AMD64Registers) {
- a.count++
- a.reported[a.key(regs)] = struct{}{}
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+ switch sysnr {
+ case syscall.SYS_ARCH_PRCTL:
+ // args: cmd, ...
+ return newArgsTracker(0)
+ }
+ return nil
}
diff --git a/runsc/boot/compat_arm64.go b/runsc/boot/compat_arm64.go
new file mode 100644
index 000000000..f784cd237
--- /dev/null
+++ b/runsc/boot/compat_arm64.go
@@ -0,0 +1,91 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
+ "gvisor.dev/gvisor/pkg/sentry/strace"
+)
+
+// reportLimit is the max number of events that should be reported per tracker.
+const reportLimit = 100
+
+// newRegs creates an empty Registers instance.
+func newRegs() *rpb.Registers {
+ return &rpb.Registers{
+ Arch: &rpb.Registers_Arm64{
+ Arm64: &rpb.ARM64Registers{},
+ },
+ }
+}
+
+func argVal(argIdx int, regs *rpb.Registers) uint32 {
+ arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+ switch argIdx {
+ case 0:
+ return uint32(arm64Regs.R0)
+ case 1:
+ return uint32(arm64Regs.R1)
+ case 2:
+ return uint32(arm64Regs.R2)
+ case 3:
+ return uint32(arm64Regs.R3)
+ case 4:
+ return uint32(arm64Regs.R4)
+ case 5:
+ return uint32(arm64Regs.R5)
+ }
+ panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+}
+
+func setArgVal(argIdx int, argVal uint64, regs *rpb.Registers) {
+ arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+
+ switch argIdx {
+ case 0:
+ arm64Regs.R0 = argVal
+ case 1:
+ arm64Regs.R1 = argVal
+ case 2:
+ arm64Regs.R2 = argVal
+ case 3:
+ arm64Regs.R3 = argVal
+ case 4:
+ arm64Regs.R4 = argVal
+ case 5:
+ arm64Regs.R5 = argVal
+ default:
+ panic(fmt.Sprintf("invalid syscall argument index %d", argIdx))
+ }
+}
+
+func getSyscallNameMap() (strace.SyscallMap, bool) {
+ return strace.Lookup(abi.Linux, arch.ARM64)
+}
+
+func syscallNum(regs *rpb.Registers) uint64 {
+ arm64Regs := regs.GetArch().(*rpb.Registers_Arm64).Arm64
+ return arm64Regs.R8
+}
+
+func newArchArgsTracker(sysnr uint64) syscallTracker {
+	// Currently, no arch-specific syscalls need to be handled here.
+ return nil
+}
diff --git a/runsc/boot/compat_test.go b/runsc/boot/compat_test.go
index 388298d8d..839c5303b 100644
--- a/runsc/boot/compat_test.go
+++ b/runsc/boot/compat_test.go
@@ -16,8 +16,6 @@ package boot
import (
"testing"
-
- rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
)
func TestOnceTracker(t *testing.T) {
@@ -35,31 +33,34 @@ func TestOnceTracker(t *testing.T) {
func TestArgsTracker(t *testing.T) {
for _, tc := range []struct {
- name string
- idx []int
- rdi1 uint64
- rdi2 uint64
- rsi1 uint64
- rsi2 uint64
- want bool
+ name string
+ idx []int
+ arg1_1 uint64
+ arg1_2 uint64
+ arg2_1 uint64
+ arg2_2 uint64
+ want bool
}{
- {name: "same rdi", idx: []int{0}, rdi1: 123, rdi2: 123, want: false},
- {name: "same rsi", idx: []int{1}, rsi1: 123, rsi2: 123, want: false},
- {name: "diff rdi", idx: []int{0}, rdi1: 123, rdi2: 321, want: true},
- {name: "diff rsi", idx: []int{1}, rsi1: 123, rsi2: 321, want: true},
- {name: "cmd is uint32", idx: []int{0}, rsi1: 0xdead00000123, rsi2: 0xbeef00000123, want: false},
- {name: "same 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 123, rdi2: 321, want: false},
- {name: "diff 2 args", idx: []int{0, 1}, rsi1: 123, rdi1: 321, rsi2: 789, rdi2: 987, want: true},
+ {name: "same arg1", idx: []int{0}, arg1_1: 123, arg1_2: 123, want: false},
+ {name: "same arg2", idx: []int{1}, arg2_1: 123, arg2_2: 123, want: false},
+ {name: "diff arg1", idx: []int{0}, arg1_1: 123, arg1_2: 321, want: true},
+ {name: "diff arg2", idx: []int{1}, arg2_1: 123, arg2_2: 321, want: true},
+ {name: "cmd is uint32", idx: []int{0}, arg2_1: 0xdead00000123, arg2_2: 0xbeef00000123, want: false},
+ {name: "same 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 123, arg1_2: 321, want: false},
+ {name: "diff 2 args", idx: []int{0, 1}, arg2_1: 123, arg1_1: 321, arg2_2: 789, arg1_2: 987, want: true},
} {
t.Run(tc.name, func(t *testing.T) {
c := newArgsTracker(tc.idx...)
- regs := &rpb.AMD64Registers{Rdi: tc.rdi1, Rsi: tc.rsi1}
+ regs := newRegs()
+ setArgVal(0, tc.arg1_1, regs)
+ setArgVal(1, tc.arg2_1, regs)
if !c.shouldReport(regs) {
t.Error("first call to shouldReport, got: false, want: true")
}
c.onReported(regs)
- regs.Rdi, regs.Rsi = tc.rdi2, tc.rsi2
+ setArgVal(0, tc.arg1_2, regs)
+ setArgVal(1, tc.arg2_2, regs)
if got := c.shouldReport(regs); tc.want != got {
t.Errorf("second call to shouldReport, got: %t, want: %t", got, tc.want)
}
@@ -70,7 +71,9 @@ func TestArgsTracker(t *testing.T) {
func TestArgsTrackerLimit(t *testing.T) {
c := newArgsTracker(0, 1)
for i := 0; i < reportLimit; i++ {
- regs := &rpb.AMD64Registers{Rdi: 123, Rsi: uint64(i)}
+ regs := newRegs()
+ setArgVal(0, 123, regs)
+ setArgVal(1, uint64(i), regs)
if !c.shouldReport(regs) {
t.Error("shouldReport before limit was reached, got: false, want: true")
}
@@ -78,7 +81,9 @@ func TestArgsTrackerLimit(t *testing.T) {
}
// Should hit the count limit now.
- regs := &rpb.AMD64Registers{Rdi: 123, Rsi: 123456}
+ regs := newRegs()
+ setArgVal(0, 123, regs)
+ setArgVal(1, 123456, regs)
if c.shouldReport(regs) {
t.Error("shouldReport after limit was reached, got: true, want: false")
}
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 72a33534f..a878bc2ce 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -250,6 +250,12 @@ type Config struct {
// multiple tests are run in parallel, since there is no way to pass
// parameters to the runtime from docker.
TestOnlyTestNameEnv string
+
+	// CPUNumFromQuota sets the number of CPUs to the available CPU quota, using
+	// the least integer value greater than or equal to the quota.
+ //
+ // E.g. 0.2 CPU quota will result in 1, and 1.9 in 2.
+ CPUNumFromQuota bool
}
// ToFlags returns a slice of flags that correspond to the given Config.
@@ -282,6 +288,9 @@ func (c *Config) ToFlags() []string {
"--software-gso=" + strconv.FormatBool(c.SoftwareGSO),
"--overlayfs-stale-read=" + strconv.FormatBool(c.OverlayfsStaleRead),
}
+ if c.CPUNumFromQuota {
+ f = append(f, "--cpu-num-from-quota")
+ }
// Only include these if set since it is never to be used by users.
if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
f = append(f, "--TESTONLY-unsafe-nonroot=true")
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index f62be4c59..9c9e94864 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -152,7 +152,9 @@ func newController(fd int, l *Loader) (*controller, error) {
srv.Register(&debug{})
srv.Register(&control.Logging{})
if l.conf.ProfileEnable {
- srv.Register(&control.Profile{})
+ srv.Register(&control.Profile{
+ Kernel: l.k,
+ })
}
return &controller{
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 677356193..4fb9adca6 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -134,11 +134,6 @@ var allowedSyscalls = seccomp.SyscallRules{
seccomp.AllowValue(syscall.SOL_SOCKET),
seccomp.AllowValue(syscall.SO_SNDBUF),
},
- {
- seccomp.AllowAny{},
- seccomp.AllowValue(syscall.SOL_SOCKET),
- seccomp.AllowValue(syscall.SO_REUSEADDR),
- },
},
syscall.SYS_GETTID: {},
syscall.SYS_GETTIMEOFDAY: {},
@@ -317,6 +312,26 @@ func hostInetFilters() seccomp.SyscallRules {
syscall.SYS_GETSOCKOPT: []seccomp.Rule{
{
seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IP),
+ seccomp.AllowValue(syscall.IP_TOS),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IP),
+ seccomp.AllowValue(syscall.IP_RECVTOS),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IPV6),
+ seccomp.AllowValue(syscall.IPV6_TCLASS),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IPV6),
+ seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+ },
+ {
+ seccomp.AllowAny{},
seccomp.AllowValue(syscall.SOL_IPV6),
seccomp.AllowValue(syscall.IPV6_V6ONLY),
},
@@ -418,6 +433,34 @@ func hostInetFilters() seccomp.SyscallRules {
seccomp.AllowAny{},
seccomp.AllowValue(4),
},
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IP),
+ seccomp.AllowValue(syscall.IP_TOS),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IP),
+ seccomp.AllowValue(syscall.IP_RECVTOS),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IPV6),
+ seccomp.AllowValue(syscall.IPV6_TCLASS),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
+ {
+ seccomp.AllowAny{},
+ seccomp.AllowValue(syscall.SOL_IPV6),
+ seccomp.AllowValue(syscall.IPV6_RECVTCLASS),
+ seccomp.AllowAny{},
+ seccomp.AllowValue(4),
+ },
},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
{
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index bc9ffaf81..421ccd255 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -16,7 +16,6 @@ package boot
import (
"fmt"
- "path"
"path/filepath"
"sort"
"strconv"
@@ -52,7 +51,7 @@ const (
rootDevice = "9pfs-/"
// MountPrefix is the annotation prefix for mount hints.
- MountPrefix = "gvisor.dev/spec/mount"
+ MountPrefix = "dev.gvisor.spec.mount."
// Filesystems that runsc supports.
bind = "bind"
@@ -490,14 +489,15 @@ type podMountHints struct {
func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
mnts := make(map[string]*mountHint)
for k, v := range spec.Annotations {
- // Look for 'gvisor.dev/spec/mount' annotations and parse them.
+ // Look for 'dev.gvisor.spec.mount' annotations and parse them.
if strings.HasPrefix(k, MountPrefix) {
- parts := strings.Split(k, "/")
- if len(parts) != 5 {
+ // Remove the prefix and split the rest.
+ parts := strings.Split(k[len(MountPrefix):], ".")
+ if len(parts) != 2 {
return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
}
- name := parts[3]
- if len(name) == 0 || path.Clean(name) != name {
+ name := parts[0]
+ if len(name) == 0 {
return nil, fmt.Errorf("invalid mount name: %s", name)
}
mnt := mnts[name]
@@ -505,7 +505,7 @@ func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
mnt = &mountHint{name: name}
mnts[name] = mnt
}
- if err := mnt.setField(parts[4], v); err != nil {
+ if err := mnt.setField(parts[1], v); err != nil {
return nil, err
}
}
@@ -575,6 +575,11 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin
func (c *containerMounter) processHints(conf *Config) error {
ctx := c.k.SupervisorContext()
for _, hint := range c.hints.mounts {
+ // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
+ // common gofer to mount all shared volumes.
+ if hint.mount.Type != tmpfs {
+ continue
+ }
log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
inode, err := c.mountSharedMaster(ctx, conf, hint)
if err != nil {
@@ -851,7 +856,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns
return fmt.Errorf("mount %q error: %v", m.Destination, err)
}
- log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+ log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts)
return nil
}
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
index 0396a4cfb..912037075 100644
--- a/runsc/boot/fs_test.go
+++ b/runsc/boot/fs_test.go
@@ -15,7 +15,6 @@
package boot
import (
- "path"
"reflect"
"strings"
"testing"
@@ -26,19 +25,19 @@ import (
func TestPodMountHintsHappy(t *testing.T) {
spec := &specs.Spec{
Annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.type": "tmpfs",
+ MountPrefix + "mount1.share": "pod",
- path.Join(MountPrefix, "mount2", "source"): "bar",
- path.Join(MountPrefix, "mount2", "type"): "bind",
- path.Join(MountPrefix, "mount2", "share"): "container",
- path.Join(MountPrefix, "mount2", "options"): "rw,private",
+ MountPrefix + "mount2.source": "bar",
+ MountPrefix + "mount2.type": "bind",
+ MountPrefix + "mount2.share": "container",
+ MountPrefix + "mount2.options": "rw,private",
},
}
podHints, err := newPodMountHints(spec)
if err != nil {
- t.Errorf("newPodMountHints failed: %v", err)
+ t.Fatalf("newPodMountHints failed: %v", err)
}
// Check that fields were set correctly.
@@ -86,95 +85,95 @@ func TestPodMountHintsErrors(t *testing.T) {
{
name: "too short",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1"): "foo",
+ MountPrefix + "mount1": "foo",
},
error: "invalid mount annotation",
},
{
name: "no name",
annotations: map[string]string{
- MountPrefix + "//source": "foo",
+ MountPrefix + ".source": "foo",
},
error: "invalid mount name",
},
{
name: "missing source",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.type": "tmpfs",
+ MountPrefix + "mount1.share": "pod",
},
error: "source field",
},
{
name: "missing type",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.share": "pod",
},
error: "type field",
},
{
name: "missing share",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.type": "tmpfs",
},
error: "share field",
},
{
name: "invalid field name",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "invalid"): "foo",
+ MountPrefix + "mount1.invalid": "foo",
},
error: "invalid mount annotation",
},
{
name: "invalid source",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "",
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.source": "",
+ MountPrefix + "mount1.type": "tmpfs",
+ MountPrefix + "mount1.share": "pod",
},
error: "source cannot be empty",
},
{
name: "invalid type",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "type"): "invalid-type",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.type": "invalid-type",
+ MountPrefix + "mount1.share": "pod",
},
error: "invalid type",
},
{
name: "invalid share",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
- path.Join(MountPrefix, "mount1", "share"): "invalid-share",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.type": "tmpfs",
+ MountPrefix + "mount1.share": "invalid-share",
},
error: "invalid share",
},
{
name: "invalid options",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
- path.Join(MountPrefix, "mount1", "share"): "pod",
- path.Join(MountPrefix, "mount1", "options"): "invalid-option",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.type": "tmpfs",
+ MountPrefix + "mount1.share": "pod",
+ MountPrefix + "mount1.options": "invalid-option",
},
error: "unknown mount option",
},
{
name: "duplicate source",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): "foo",
- path.Join(MountPrefix, "mount1", "type"): "tmpfs",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.source": "foo",
+ MountPrefix + "mount1.type": "tmpfs",
+ MountPrefix + "mount1.share": "pod",
- path.Join(MountPrefix, "mount2", "source"): "foo",
- path.Join(MountPrefix, "mount2", "type"): "bind",
- path.Join(MountPrefix, "mount2", "share"): "container",
+ MountPrefix + "mount2.source": "foo",
+ MountPrefix + "mount2.type": "bind",
+ MountPrefix + "mount2.share": "container",
},
error: "have the same mount source",
},
@@ -202,36 +201,36 @@ func TestGetMountAccessType(t *testing.T) {
{
name: "container=exclusive",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): source,
- path.Join(MountPrefix, "mount1", "type"): "bind",
- path.Join(MountPrefix, "mount1", "share"): "container",
+ MountPrefix + "mount1.source": source,
+ MountPrefix + "mount1.type": "bind",
+ MountPrefix + "mount1.share": "container",
},
want: FileAccessExclusive,
},
{
name: "pod=shared",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): source,
- path.Join(MountPrefix, "mount1", "type"): "bind",
- path.Join(MountPrefix, "mount1", "share"): "pod",
+ MountPrefix + "mount1.source": source,
+ MountPrefix + "mount1.type": "bind",
+ MountPrefix + "mount1.share": "pod",
},
want: FileAccessShared,
},
{
name: "shared=shared",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): source,
- path.Join(MountPrefix, "mount1", "type"): "bind",
- path.Join(MountPrefix, "mount1", "share"): "shared",
+ MountPrefix + "mount1.source": source,
+ MountPrefix + "mount1.type": "bind",
+ MountPrefix + "mount1.share": "shared",
},
want: FileAccessShared,
},
{
name: "default=shared",
annotations: map[string]string{
- path.Join(MountPrefix, "mount1", "source"): source + "mismatch",
- path.Join(MountPrefix, "mount1", "type"): "bind",
- path.Join(MountPrefix, "mount1", "share"): "container",
+ MountPrefix + "mount1.source": source + "mismatch",
+ MountPrefix + "mount1.type": "bind",
+ MountPrefix + "mount1.share": "container",
},
want: FileAccessShared,
},
diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go
index d1c0bb9b5..ce62236e5 100644
--- a/runsc/boot/limits.go
+++ b/runsc/boot/limits.go
@@ -16,12 +16,12 @@ package boot
import (
"fmt"
- "sync"
"syscall"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sync"
)
// Mapping from linux resource names to limits.LimitType.
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index df6052c88..fad72f4ab 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -20,7 +20,6 @@ import (
mrand "math/rand"
"os"
"runtime"
- "sync"
"sync/atomic"
"syscall"
gtime "time"
@@ -46,6 +45,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/watchdog"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
"gvisor.dev/gvisor/pkg/tcpip/network/arp"
@@ -93,10 +93,6 @@ type Loader struct {
// spec is the base configuration for the root container.
spec *specs.Spec
- // startSignalForwarding enables forwarding of signals to the sandboxed
- // container. It should be called after the init process is loaded.
- startSignalForwarding func() func()
-
// stopSignalForwarding disables forwarding of signals to the sandboxed
// container. It should be called when a sandbox is destroyed.
stopSignalForwarding func()
@@ -336,29 +332,6 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("ignore child stop signals failed: %v", err)
}
- // Handle signals by forwarding them to the root container process
- // (except for panic signal, which should cause a panic).
- l.startSignalForwarding = sighandling.PrepareHandler(func(sig linux.Signal) {
- // Panic signal should cause a panic.
- if args.Conf.PanicSignal != -1 && sig == linux.Signal(args.Conf.PanicSignal) {
- panic("Signal-induced panic")
- }
-
- // Otherwise forward to root container.
- deliveryMode := DeliverToProcess
- if args.Console {
- // Since we are running with a console, we should
- // forward the signal to the foreground process group
- // so that job control signals like ^C can be handled
- // properly.
- deliveryMode = DeliverToForegroundProcessGroup
- }
- log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
- if err := l.signal(args.ID, 0, int32(sig), deliveryMode); err != nil {
- log.Warningf("error sending signal %v to container %q: %v", sig, args.ID, err)
- }
- })
-
// Create the control server using the provided FD.
//
// This must be done *after* we have initialized the kernel since the
@@ -566,8 +539,27 @@ func (l *Loader) run() error {
ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup())
}
- // Start signal forwarding only after an init process is created.
- l.stopSignalForwarding = l.startSignalForwarding()
+ // Handle signals by forwarding them to the root container process
+ // (except for panic signal, which should cause a panic).
+ l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
+ // Panic signal should cause a panic.
+ if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
+ panic("Signal-induced panic")
+ }
+
+ // Otherwise forward to root container.
+ deliveryMode := DeliverToProcess
+ if l.console {
+ // Since we are running with a console, we should forward the signal to
+ // the foreground process group so that job control signals like ^C can
+ // be handled properly.
+ deliveryMode = DeliverToForegroundProcessGroup
+ }
+ log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
+ if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
+ log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err)
+ }
+ })
log.Infof("Process should have started...")
l.watchdog.Start()
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 147ff7703..bec0dc292 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -19,7 +19,6 @@ import (
"math/rand"
"os"
"reflect"
- "sync"
"syscall"
"testing"
"time"
@@ -30,6 +29,7 @@ import (
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
"gvisor.dev/gvisor/runsc/fsgofer"
)
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index f98c5fd36..6a8765ec8 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -80,7 +80,8 @@ type CreateLinksAndRoutesArgs struct {
LoopbackLinks []LoopbackLink
FDBasedLinks []FDBasedLink
- DefaultGateway DefaultRoute
+ Defaultv4Gateway DefaultRoute
+ Defaultv6Gateway DefaultRoute
}
// Empty returns true if route hasn't been set.
@@ -122,10 +123,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
nicID++
nicids[link.Name] = nicID
- ep := loopback.New()
+ linkEP := loopback.New()
log.Infof("Enabling loopback interface %q with id %d on addresses %+v", link.Name, nicID, link.Addresses)
- if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, true /* loopback */); err != nil {
+ if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
return err
}
@@ -157,7 +158,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
}
mac := tcpip.LinkAddress(link.LinkAddress)
- ep, err := fdbased.New(&fdbased.Options{
+ linkEP, err := fdbased.New(&fdbased.Options{
FDs: FDs,
MTU: uint32(link.MTU),
EthernetHeader: true,
@@ -172,7 +173,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
}
log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
- if err := n.createNICWithAddrs(nicID, link.Name, ep, link.Addresses, false /* loopback */); err != nil {
+ if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
return err
}
@@ -186,12 +187,24 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
}
}
- if !args.DefaultGateway.Route.Empty() {
- nicID, ok := nicids[args.DefaultGateway.Name]
+ if !args.Defaultv4Gateway.Route.Empty() {
+ nicID, ok := nicids[args.Defaultv4Gateway.Name]
if !ok {
- return fmt.Errorf("invalid interface name %q for default route", args.DefaultGateway.Name)
+ return fmt.Errorf("invalid interface name %q for default route", args.Defaultv4Gateway.Name)
}
- route, err := args.DefaultGateway.Route.toTcpipRoute(nicID)
+ route, err := args.Defaultv4Gateway.Route.toTcpipRoute(nicID)
+ if err != nil {
+ return err
+ }
+ routes = append(routes, route)
+ }
+
+ if !args.Defaultv6Gateway.Route.Empty() {
+ nicID, ok := nicids[args.Defaultv6Gateway.Name]
+ if !ok {
+ return fmt.Errorf("invalid interface name %q for default route", args.Defaultv6Gateway.Name)
+ }
+ route, err := args.Defaultv6Gateway.Route.toTcpipRoute(nicID)
if err != nil {
return err
}
@@ -205,15 +218,10 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
// createNICWithAddrs creates a NIC in the network stack and adds the given
// addresses.
-func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP, loopback bool) error {
- if loopback {
- if err := n.Stack.CreateNamedLoopbackNIC(id, name, sniffer.New(ep)); err != nil {
- return fmt.Errorf("CreateNamedLoopbackNIC(%v, %v) failed: %v", id, name, err)
- }
- } else {
- if err := n.Stack.CreateNamedNIC(id, name, sniffer.New(ep)); err != nil {
- return fmt.Errorf("CreateNamedNIC(%v, %v) failed: %v", id, name, err)
- }
+func (n *Network) createNICWithAddrs(id tcpip.NICID, name string, ep stack.LinkEndpoint, addrs []net.IP) error {
+ opts := stack.NICOptions{Name: name}
+ if err := n.Stack.CreateNICWithOptions(id, sniffer.New(ep), opts); err != nil {
+ return fmt.Errorf("CreateNICWithOptions(%d, _, %+v) failed: %v", id, opts, err)
}
// Always start with an arp address for the NIC.
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go
index ab3a25b9b..653ca5f52 100644
--- a/runsc/cgroup/cgroup.go
+++ b/runsc/cgroup/cgroup.go
@@ -101,6 +101,14 @@ func getValue(path, name string) (string, error) {
return string(out), nil
}
+func getInt(path, name string) (int, error) {
+ s, err := getValue(path, name)
+ if err != nil {
+ return 0, err
+ }
+ return strconv.Atoi(strings.TrimSpace(s))
+}
+
// fillFromAncestor sets the value of a cgroup file from the first ancestor
// that has content. It does nothing if the file in 'path' has already been set.
func fillFromAncestor(path string) (string, error) {
@@ -323,6 +331,22 @@ func (c *Cgroup) Join() (func(), error) {
return undo, nil
}
+func (c *Cgroup) CPUQuota() (float64, error) {
+ path := c.makePath("cpu")
+ quota, err := getInt(path, "cpu.cfs_quota_us")
+ if err != nil {
+ return -1, err
+ }
+ period, err := getInt(path, "cpu.cfs_period_us")
+ if err != nil {
+ return -1, err
+ }
+ if quota <= 0 || period <= 0 {
+ return -1, err
+ }
+ return float64(quota) / float64(period), nil
+}
+
// NumCPU returns the number of CPUs configured in 'cpuset/cpuset.cpus'.
func (c *Cgroup) NumCPU() (int, error) {
path := c.makePath("cpuset")
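
An illustrative sketch of the arithmetic behind CPUNumFromQuota, using the CPUQuota helper added above (the function name cpusFromQuota and the import path are assumptions, not code from this change): the cpu.cfs_quota_us / cpu.cfs_period_us ratio is rounded up to the next integer, so a quota of 0.2 CPUs yields 1 and 1.9 yields 2.

package main

import (
	"fmt"
	"math"

	"gvisor.dev/gvisor/runsc/cgroup"
)

// cpusFromQuota converts a container's CPU quota into a CPU count by taking
// the ceiling of the quota/period ratio reported by Cgroup.CPUQuota.
func cpusFromQuota(cg *cgroup.Cgroup) (int, error) {
	quota, err := cg.CPUQuota()
	if err != nil {
		return 0, fmt.Errorf("reading CPU quota: %v", err)
	}
	if quota <= 0 {
		return 0, fmt.Errorf("no positive CPU quota configured")
	}
	return int(math.Ceil(quota)), nil
}
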
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index 250845ad7..b94bc4fa0 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -44,6 +44,7 @@ go_library(
"//pkg/sentry/control",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
+ "//pkg/sync",
"//pkg/unet",
"//pkg/urpc",
"//runsc/boot",
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
index a4e3071b3..1815c93b9 100644
--- a/runsc/cmd/create.go
+++ b/runsc/cmd/create.go
@@ -16,6 +16,7 @@ package cmd
import (
"context"
+
"flag"
"github.com/google/subcommands"
"gvisor.dev/gvisor/runsc/boot"
diff --git a/runsc/cmd/debug.go b/runsc/cmd/debug.go
index 7313e473f..f37415810 100644
--- a/runsc/cmd/debug.go
+++ b/runsc/cmd/debug.go
@@ -32,16 +32,17 @@ import (
// Debug implements subcommands.Command for the "debug" command.
type Debug struct {
- pid int
- stacks bool
- signal int
- profileHeap string
- profileCPU string
- profileDelay int
- trace string
- strace string
- logLevel string
- logPackets string
+ pid int
+ stacks bool
+ signal int
+ profileHeap string
+ profileCPU string
+ trace string
+ strace string
+ logLevel string
+ logPackets string
+ duration time.Duration
+ ps bool
}
// Name implements subcommands.Command.
@@ -65,12 +66,13 @@ func (d *Debug) SetFlags(f *flag.FlagSet) {
f.BoolVar(&d.stacks, "stacks", false, "if true, dumps all sandbox stacks to the log")
f.StringVar(&d.profileHeap, "profile-heap", "", "writes heap profile to the given file.")
f.StringVar(&d.profileCPU, "profile-cpu", "", "writes CPU profile to the given file.")
- f.IntVar(&d.profileDelay, "profile-delay", 5, "amount of time to wait before stoping CPU profile")
+ f.DurationVar(&d.duration, "duration", time.Second, "amount of time to wait for CPU and trace profiles")
f.StringVar(&d.trace, "trace", "", "writes an execution trace to the given file.")
f.IntVar(&d.signal, "signal", -1, "sends signal to the sandbox")
f.StringVar(&d.strace, "strace", "", `A comma separated list of syscalls to trace. "all" enables all traces, "off" disables all`)
f.StringVar(&d.logLevel, "log-level", "", "The log level to set: warning (0), info (1), or debug (2).")
f.StringVar(&d.logPackets, "log-packets", "", "A boolean value to enable or disable packet logging: true or false.")
+ f.BoolVar(&d.ps, "ps", false, "lists processes")
}
// Execute implements subcommands.Command.Execute.
@@ -163,7 +165,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
if err := c.Sandbox.StartCPUProfile(f); err != nil {
return Errorf(err.Error())
}
- log.Infof("CPU profile started for %d sec, writing to %q", d.profileDelay, d.profileCPU)
+ log.Infof("CPU profile started for %v, writing to %q", d.duration, d.profileCPU)
}
if d.trace != "" {
delay = true
@@ -181,8 +183,7 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
if err := c.Sandbox.StartTrace(f); err != nil {
return Errorf(err.Error())
}
- log.Infof("Tracing started for %d sec, writing to %q", d.profileDelay, d.trace)
-
+ log.Infof("Tracing started for %v, writing to %q", d.duration, d.trace)
}
if d.strace != "" || len(d.logLevel) != 0 || len(d.logPackets) != 0 {
@@ -241,9 +242,20 @@ func (d *Debug) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
log.Infof("Logging options changed")
}
+ if d.ps {
+ pList, err := c.Processes()
+ if err != nil {
+ Fatalf("getting processes for container: %v", err)
+ }
+ o, err := control.ProcessListToJSON(pList)
+ if err != nil {
+ Fatalf("generating JSON: %v", err)
+ }
+ log.Infof(o)
+ }
if delay {
- time.Sleep(time.Duration(d.profileDelay) * time.Second)
+ time.Sleep(d.duration)
}
return subcommands.ExitSuccess
diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go
index 4831210c0..7df7995f0 100644
--- a/runsc/cmd/gofer.go
+++ b/runsc/cmd/gofer.go
@@ -21,7 +21,6 @@ import (
"os"
"path/filepath"
"strings"
- "sync"
"syscall"
"flag"
@@ -30,6 +29,7 @@ import (
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/fsgofer"
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index de2115dff..5e9bc53ab 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -16,6 +16,7 @@ package cmd
import (
"context"
+
"flag"
"github.com/google/subcommands"
"gvisor.dev/gvisor/runsc/boot"
diff --git a/runsc/container/BUILD b/runsc/container/BUILD
index 2bd12120d..6dea179e4 100644
--- a/runsc/container/BUILD
+++ b/runsc/container/BUILD
@@ -18,6 +18,7 @@ go_library(
deps = [
"//pkg/log",
"//pkg/sentry/control",
+ "//pkg/sync",
"//runsc/boot",
"//runsc/cgroup",
"//runsc/sandbox",
@@ -53,6 +54,7 @@ go_test(
"//pkg/sentry/control",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
+ "//pkg/sync",
"//pkg/unet",
"//pkg/urpc",
"//runsc/boot",
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index 7d67c3a75..060b63bf3 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -20,7 +20,6 @@ import (
"io"
"os"
"path/filepath"
- "sync"
"syscall"
"testing"
"time"
@@ -28,6 +27,8 @@ import (
"github.com/kr/pty"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/sentry/control"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/unet"
"gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/testutil"
@@ -219,9 +220,9 @@ func TestJobControlSignalExec(t *testing.T) {
// Make sure all the processes are running.
expectedPL := []*control.Process{
// Root container process.
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
// Bash from exec process.
- {PID: 2, Cmd: "bash"},
+ {PID: 2, Cmd: "bash", Threads: []kernel.ThreadID{2}},
}
if err := waitForProcessList(c, expectedPL); err != nil {
t.Error(err)
@@ -231,7 +232,7 @@ func TestJobControlSignalExec(t *testing.T) {
ptyMaster.Write([]byte("sleep 100\n"))
// Wait for it to start. Sleep's PPID is bash's PID.
- expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep"})
+ expectedPL = append(expectedPL, &control.Process{PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}})
if err := waitForProcessList(c, expectedPL); err != nil {
t.Error(err)
}
@@ -361,7 +362,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
// Wait for bash to start.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "bash"},
+ {PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(c, expectedPL); err != nil {
t.Fatal(err)
@@ -371,7 +372,7 @@ func TestJobControlSignalRootContainer(t *testing.T) {
ptyMaster.Write([]byte("sleep 100\n"))
// Wait for sleep to start.
- expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep"})
+ expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}})
if err := waitForProcessList(c, expectedPL); err != nil {
t.Fatal(err)
}
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index 07eacaac0..b54d8f712 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -26,7 +26,6 @@ import (
"reflect"
"strconv"
"strings"
- "sync"
"syscall"
"testing"
"time"
@@ -37,7 +36,9 @@ import (
"gvisor.dev/gvisor/pkg/bits"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/boot/platforms"
"gvisor.dev/gvisor/runsc/specutils"
@@ -52,8 +53,9 @@ func waitForProcessList(cont *Container, want []*control.Process) error {
err = fmt.Errorf("error getting process data from container: %v", err)
return &backoff.PermanentError{Err: err}
}
- if !procListsEqual(got, want) {
- return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
+ if r, err := procListsEqual(got, want); !r {
+ return fmt.Errorf("container got process list: %s, want: %s: error: %v",
+ procListToString(got), procListToString(want), err)
}
return nil
}
@@ -91,22 +93,34 @@ func blockUntilWaitable(pid int) error {
// procListsEqual is used to check whether 2 Process lists are equal for all
// implemented fields.
-func procListsEqual(got, want []*control.Process) bool {
+func procListsEqual(got, want []*control.Process) (bool, error) {
if len(got) != len(want) {
- return false
+ return false, nil
}
for i := range got {
pd1 := got[i]
pd2 := want[i]
- // Zero out unimplemented and timing dependant fields.
+ // Zero out timing dependent fields.
pd1.Time = ""
pd1.STime = ""
pd1.C = 0
- if *pd1 != *pd2 {
- return false
+ // Ignore TTY field too, since it's not relevant in the cases
+ // where we use this method. Tests that care about the TTY
+ // field should check for it themselves.
+ pd1.TTY = ""
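+ // Compare the JSON encodings rather than the structs directly, since
+ // control.Process now carries a Threads slice and is no longer
+ // comparable with ==.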
+ pd1Json, err := control.ProcessListToJSON([]*control.Process{pd1})
+ if err != nil {
+ return false, err
+ }
+ pd2Json, err := control.ProcessListToJSON([]*control.Process{pd2})
+ if err != nil {
+ return false, err
+ }
+ if pd1Json != pd2Json {
+ return false, nil
}
}
- return true
+ return true, nil
}
// getAndCheckProcLists is similar to waitForProcessList, but does not wait and retry the
@@ -116,7 +130,11 @@ func getAndCheckProcLists(cont *Container, want []*control.Process) error {
if err != nil {
return fmt.Errorf("error getting process data from container: %v", err)
}
- if procListsEqual(got, want) {
+ equal, err := procListsEqual(got, want)
+ if err != nil {
+ return err
+ }
+ if equal {
return nil
}
return fmt.Errorf("container got process list: %s, want: %s", procListToString(got), procListToString(want))
@@ -288,11 +306,12 @@ func TestLifecycle(t *testing.T) {
// expectedPL lists the expected process state of the container.
expectedPL := []*control.Process{
{
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
+ UID: 0,
+ PID: 1,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ Threads: []kernel.ThreadID{1},
},
}
// Create the container.
@@ -590,18 +609,20 @@ func TestExec(t *testing.T) {
// expectedPL lists the expected process state of the container.
expectedPL := []*control.Process{
{
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
+ UID: 0,
+ PID: 1,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ Threads: []kernel.ThreadID{1},
},
{
- UID: uid,
- PID: 2,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
+ UID: uid,
+ PID: 2,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ Threads: []kernel.ThreadID{2},
},
}
@@ -1062,18 +1083,20 @@ func TestPauseResume(t *testing.T) {
// expectedPL lists the expected process state of the container.
expectedPL := []*control.Process{
{
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
+ UID: 0,
+ PID: 1,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ Threads: []kernel.ThreadID{1},
},
{
- UID: uid,
- PID: 2,
- PPID: 0,
- C: 0,
- Cmd: "bash",
+ UID: uid,
+ PID: 2,
+ PPID: 0,
+ C: 0,
+ Cmd: "bash",
+ Threads: []kernel.ThreadID{2},
},
}
@@ -1126,11 +1149,12 @@ func TestPauseResume(t *testing.T) {
expectedPL2 := []*control.Process{
{
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
+ UID: 0,
+ PID: 1,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ Threads: []kernel.ThreadID{1},
},
}
@@ -1241,18 +1265,20 @@ func TestCapabilities(t *testing.T) {
// expectedPL lists the expected process state of the container.
expectedPL := []*control.Process{
{
- UID: 0,
- PID: 1,
- PPID: 0,
- C: 0,
- Cmd: "sleep",
+ UID: 0,
+ PID: 1,
+ PPID: 0,
+ C: 0,
+ Cmd: "sleep",
+ Threads: []kernel.ThreadID{1},
},
{
- UID: uid,
- PID: 2,
- PPID: 0,
- C: 0,
- Cmd: "exe",
+ UID: uid,
+ PID: 2,
+ PPID: 0,
+ C: 0,
+ Cmd: "exe",
+ Threads: []kernel.ThreadID{2},
},
}
if err := waitForProcessList(cont, expectedPL[:1]); err != nil {
@@ -2112,6 +2138,95 @@ func TestOverlayfsStaleRead(t *testing.T) {
}
}
+// TestTTYField checks TTY field returned by container.Processes().
+func TestTTYField(t *testing.T) {
+ stop := testutil.StartReaper()
+ defer stop()
+
+ testApp, err := testutil.FindFile("runsc/container/test_app/test_app")
+ if err != nil {
+ t.Fatal("error finding test_app:", err)
+ }
+
+ testCases := []struct {
+ name string
+ useTTY bool
+ wantTTYField string
+ }{
+ {
+ name: "no tty",
+ useTTY: false,
+ wantTTYField: "?",
+ },
+ {
+ name: "tty used",
+ useTTY: true,
+ wantTTYField: "pts/0",
+ },
+ }
+
+ for _, test := range testCases {
+ t.Run(test.name, func(t *testing.T) {
+ conf := testutil.TestConfig()
+
+ // We will run /bin/sleep, possibly with an open TTY.
+ cmd := []string{"/bin/sleep", "10000"}
+ if test.useTTY {
+ // Run inside the "pty-runner".
+ cmd = append([]string{testApp, "pty-runner"}, cmd...)
+ }
+
+ spec := testutil.NewSpecWithArgs(cmd...)
+ rootDir, bundleDir, err := testutil.SetupContainer(spec, conf)
+ if err != nil {
+ t.Fatalf("error setting up container: %v", err)
+ }
+ defer os.RemoveAll(rootDir)
+ defer os.RemoveAll(bundleDir)
+
+ // Create and start the container.
+ args := Args{
+ ID: testutil.UniqueContainerID(),
+ Spec: spec,
+ BundleDir: bundleDir,
+ }
+ c, err := New(conf, args)
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ defer c.Destroy()
+ if err := c.Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
+ }
+
+ // Wait for sleep to be running, and check the TTY
+ // field.
+ var gotTTYField string
+ cb := func() error {
+ ps, err := c.Processes()
+ if err != nil {
+ err = fmt.Errorf("error getting process data from container: %v", err)
+ return &backoff.PermanentError{Err: err}
+ }
+ for _, p := range ps {
+ if strings.Contains(p.Cmd, "sleep") {
+ gotTTYField = p.TTY
+ return nil
+ }
+ }
+ return fmt.Errorf("sleep not running")
+ }
+ if err := testutil.Poll(cb, 30*time.Second); err != nil {
+ t.Fatalf("error waiting for sleep process: %v", err)
+ }
+
+ if gotTTYField != test.wantTTYField {
+ t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField)
+ }
+ })
+ }
+}
+
// executeSync synchronously executes a new process.
func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus, error) {
pid, err := cont.Execute(args)
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index a5a62378c..2da93ec5b 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -22,7 +22,6 @@ import (
"path"
"path/filepath"
"strings"
- "sync"
"syscall"
"testing"
"time"
@@ -30,6 +29,7 @@ import (
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/specutils"
"gvisor.dev/gvisor/runsc/testutil"
@@ -123,11 +123,11 @@ func execMany(execs []execDesc) error {
func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
for _, spec := range pod {
- spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mount.Source
- spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mount.Type
- spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod"
+ spec.Annotations[boot.MountPrefix+name+".source"] = mount.Source
+ spec.Annotations[boot.MountPrefix+name+".type"] = mount.Type
+ spec.Annotations[boot.MountPrefix+name+".share"] = "pod"
if len(mount.Options) > 0 {
- spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mount.Options, ",")
+ spec.Annotations[boot.MountPrefix+name+".options"] = strings.Join(mount.Options, ",")
}
}
}
@@ -156,13 +156,13 @@ func TestMultiContainerSanity(t *testing.T) {
// Check via ps that multiple processes are running.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
}
expectedPL = []*control.Process{
- {PID: 2, Cmd: "sleep"},
+ {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -202,13 +202,13 @@ func TestMultiPIDNS(t *testing.T) {
// Check via ps that multiple processes are running.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
}
expectedPL = []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -264,7 +264,7 @@ func TestMultiPIDNSPath(t *testing.T) {
// Check via ps that multiple processes are running.
expectedPL := []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -274,7 +274,7 @@ func TestMultiPIDNSPath(t *testing.T) {
}
expectedPL = []*control.Process{
- {PID: 2, Cmd: "sleep"},
+ {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -306,7 +306,7 @@ func TestMultiContainerWait(t *testing.T) {
// Check via ps that multiple processes are running.
expectedPL := []*control.Process{
- {PID: 2, Cmd: "sleep"},
+ {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -351,7 +351,7 @@ func TestMultiContainerWait(t *testing.T) {
// After Wait returns, ensure that the root container is running and
// the child has finished.
expectedPL = []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for %q to start: %v", strings.Join(containers[0].Spec.Process.Args, " "), err)
@@ -383,7 +383,7 @@ func TestExecWait(t *testing.T) {
// Check via ps that process is running.
expectedPL := []*control.Process{
- {PID: 2, Cmd: "sleep"},
+ {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
t.Fatalf("failed to wait for sleep to start: %v", err)
@@ -418,7 +418,7 @@ func TestExecWait(t *testing.T) {
// Wait for the exec'd process to exit.
expectedPL = []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Fatalf("failed to wait for second container to stop: %v", err)
@@ -505,7 +505,7 @@ func TestMultiContainerSignal(t *testing.T) {
// Check via ps that container 1 process is running.
expectedPL := []*control.Process{
- {PID: 2, Cmd: "sleep"},
+ {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}},
}
if err := waitForProcessList(containers[1], expectedPL); err != nil {
@@ -519,7 +519,7 @@ func TestMultiContainerSignal(t *testing.T) {
// Make sure process 1 is still running.
expectedPL = []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -633,9 +633,10 @@ func TestMultiContainerDestroy(t *testing.T) {
if err != nil {
t.Fatalf("error getting process data from sandbox: %v", err)
}
- expectedPL := []*control.Process{{PID: 1, Cmd: "sleep"}}
- if !procListsEqual(pss, expectedPL) {
- t.Errorf("container got process list: %s, want: %s", procListToString(pss), procListToString(expectedPL))
+ expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}}
+ if r, err := procListsEqual(pss, expectedPL); !r {
+ t.Errorf("container got process list: %s, want: %s: error: %v",
+ procListToString(pss), procListToString(expectedPL), err)
}
// Check that cont.Destroy is safe to call multiple times.
@@ -669,7 +670,7 @@ func TestMultiContainerProcesses(t *testing.T) {
// Check root's container process list doesn't include other containers.
expectedPL0 := []*control.Process{
- {PID: 1, Cmd: "sleep"},
+ {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}},
}
if err := waitForProcessList(containers[0], expectedPL0); err != nil {
t.Errorf("failed to wait for process to start: %v", err)
@@ -677,8 +678,8 @@ func TestMultiContainerProcesses(t *testing.T) {
// Same for the other container.
expectedPL1 := []*control.Process{
- {PID: 2, Cmd: "sh"},
- {PID: 3, PPID: 2, Cmd: "sleep"},
+ {PID: 2, Cmd: "sh", Threads: []kernel.ThreadID{2}},
+ {PID: 3, PPID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
}
if err := waitForProcessList(containers[1], expectedPL1); err != nil {
t.Errorf("failed to wait for process to start: %v", err)
@@ -692,7 +693,7 @@ func TestMultiContainerProcesses(t *testing.T) {
if _, err := containers[1].Execute(args); err != nil {
t.Fatalf("error exec'ing: %v", err)
}
- expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep"})
+ expectedPL1 = append(expectedPL1, &control.Process{PID: 4, Cmd: "sleep", Threads: []kernel.ThreadID{4}})
if err := waitForProcessList(containers[1], expectedPL1); err != nil {
t.Errorf("failed to wait for process to start: %v", err)
}
@@ -1513,7 +1514,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
// Ensure container is running
c := containers[2]
expectedPL := []*control.Process{
- {PID: 3, Cmd: "sleep"},
+ {PID: 3, Cmd: "sleep", Threads: []kernel.ThreadID{3}},
}
if err := waitForProcessList(c, expectedPL); err != nil {
t.Errorf("failed to wait for sleep to start: %v", err)
@@ -1541,7 +1542,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
continue // container[2] has been killed.
}
pl := []*control.Process{
- {PID: kernel.ThreadID(i + 1), Cmd: "sleep"},
+ {PID: kernel.ThreadID(i + 1), Cmd: "sleep", Threads: []kernel.ThreadID{kernel.ThreadID(i + 1)}},
}
if err := waitForProcessList(c, pl); err != nil {
t.Errorf("Container %q was affected by another container: %v", c.ID, err)
@@ -1561,7 +1562,7 @@ func TestMultiContainerGoferKilled(t *testing.T) {
// Wait until sandbox stops. waitForProcessList will loop until sandbox exits
// and RPC errors out.
impossiblePL := []*control.Process{
- {PID: 100, Cmd: "non-existent-process"},
+ {PID: 100, Cmd: "non-existent-process", Threads: []kernel.ThreadID{100}},
}
if err := waitForProcessList(c, impossiblePL); err == nil {
t.Fatalf("Sandbox was not killed after gofer death")
diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go
index d95151ea5..17a251530 100644
--- a/runsc/container/state_file.go
+++ b/runsc/container/state_file.go
@@ -20,10 +20,10 @@ import (
"io/ioutil"
"os"
"path/filepath"
- "sync"
"github.com/gofrs/flock"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sync"
)
const stateFileExtension = ".state"
diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD
index 9bf9e6e9d..bfd338bb6 100644
--- a/runsc/container/test_app/BUILD
+++ b/runsc/container/test_app/BUILD
@@ -15,5 +15,6 @@ go_binary(
"//pkg/unet",
"//runsc/testutil",
"@com_github_google_subcommands//:go_default_library",
+ "@com_github_kr_pty//:go_default_library",
],
)
diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go
index 913d781c6..a1c8a741a 100644
--- a/runsc/container/test_app/test_app.go
+++ b/runsc/container/test_app/test_app.go
@@ -19,6 +19,7 @@ package main
import (
"context"
"fmt"
+ "io"
"io/ioutil"
"log"
"net"
@@ -31,6 +32,7 @@ import (
"flag"
"github.com/google/subcommands"
+ "github.com/kr/pty"
"gvisor.dev/gvisor/runsc/testutil"
)
@@ -41,6 +43,7 @@ func main() {
subcommands.Register(new(fdReceiver), "")
subcommands.Register(new(fdSender), "")
subcommands.Register(new(forkBomb), "")
+ subcommands.Register(new(ptyRunner), "")
subcommands.Register(new(reaper), "")
subcommands.Register(new(syscall), "")
subcommands.Register(new(taskTree), "")
@@ -352,3 +355,40 @@ func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...inter
return subcommands.ExitSuccess
}
+
+type ptyRunner struct{}
+
+// Name implements subcommands.Command.
+func (*ptyRunner) Name() string {
+ return "pty-runner"
+}
+
+// Synopsis implements subcommands.Command.
+func (*ptyRunner) Synopsis() string {
+ return "runs the given command with an open pty terminal"
+}
+
+// Usage implements subcommands.Command.
+func (*ptyRunner) Usage() string {
+ return "pty-runner [command]"
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (*ptyRunner) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.
+func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus {
+ c := exec.Command(fs.Args()[0], fs.Args()[1:]...)
+ f, err := pty.Start(c)
+ if err != nil {
+ fmt.Printf("pty.Start failed: %v", err)
+ return subcommands.ExitFailure
+ }
+ defer f.Close()
+
+ // Copy stdout from the command to keep this process alive until the
+ // subprocess exits.
+ io.Copy(os.Stdout, f)
+
+ return subcommands.ExitSuccess
+}
diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go
index 57f6ae8de..9b6346ca2 100644
--- a/runsc/dockerutil/dockerutil.go
+++ b/runsc/dockerutil/dockerutil.go
@@ -380,6 +380,16 @@ func (d *Docker) FindPort(sandboxPort int) (int, error) {
return port, nil
}
+// FindIP returns the IP address of the container as a string.
+func (d *Docker) FindIP() (string, error) {
+ const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}`
+ out, err := do("inspect", "-f", format, d.Name)
+ if err != nil {
+ return "", fmt.Errorf("error retrieving IP: %v", err)
+ }
+ return strings.TrimSpace(out), nil
+}
+
// SandboxPid returns the PID to the sandbox process.
func (d *Docker) SandboxPid() (int, error) {
out, err := do("inspect", "-f={{.State.Pid}}", d.Name)
diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD
index afcb41801..a9582d92b 100644
--- a/runsc/fsgofer/BUILD
+++ b/runsc/fsgofer/BUILD
@@ -19,6 +19,7 @@ go_library(
"//pkg/fd",
"//pkg/log",
"//pkg/p9",
+ "//pkg/sync",
"//pkg/syserr",
"//runsc/specutils",
"@org_golang_x_sys//unix:go_default_library",
diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go
index 9117d9616..93606d051 100644
--- a/runsc/fsgofer/fsgofer.go
+++ b/runsc/fsgofer/fsgofer.go
@@ -29,7 +29,6 @@ import (
"path/filepath"
"runtime"
"strconv"
- "sync"
"syscall"
"golang.org/x/sys/unix"
@@ -37,6 +36,7 @@ import (
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/runsc/specutils"
)
@@ -199,6 +199,7 @@ func (a *attachPoint) makeQID(stat syscall.Stat_t) p9.QID {
// The reason that the file is not opened initially as read-write is for better
// performance with 'overlay2' storage driver. overlay2 eagerly copies the
// entire file up when it's opened in write mode, and would perform badly when
+// multiple files are only being opened for read (esp. startup).
type localFile struct {
p9.DefaultWalkGetAttr
@@ -956,14 +957,14 @@ func (l *localFile) Readdir(offset uint64, count uint32) ([]p9.Dirent, error) {
}
func (l *localFile) readDirent(f int, offset uint64, count uint32, skip uint64) ([]p9.Dirent, error) {
+ var dirents []p9.Dirent
+
// Limit 'count' to cap the slice size that is returned.
const maxCount = 100000
if count > maxCount {
count = maxCount
}
- dirents := make([]p9.Dirent, 0, count)
-
// Pre-allocate buffers that will be reused to get partial results.
direntsBuf := make([]byte, 8192)
names := make([]string, 0, 100)
diff --git a/runsc/main.go b/runsc/main.go
index 4682b308c..abf929511 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -82,6 +82,7 @@ var (
numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels(FDs) to use for network link endpoints.")
rootless = flag.Bool("rootless", false, "it allows the sandbox to be started with a user that is not root. Sandbox and Gofer processes may run with same privileges as current user.")
referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.")
+ cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater than or equal to the quota value, but not less than 2)")
// Test flags, not to be used outside tests, ever.
testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
@@ -225,6 +226,7 @@ func main() {
AlsoLogToStderr: *alsoLogToStderr,
ReferenceLeakMode: refsLeakMode,
OverlayfsStaleRead: *overlayfsStaleRead,
+ CPUNumFromQuota: *cpuNumFromQuota,
TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
TestOnlyTestNameEnv: *testOnlyTestNameEnv,
diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD
index 27459e6d1..ddbc37456 100644
--- a/runsc/sandbox/BUILD
+++ b/runsc/sandbox/BUILD
@@ -19,6 +19,8 @@ go_library(
"//pkg/log",
"//pkg/sentry/control",
"//pkg/sentry/platform",
+ "//pkg/sync",
+ "//pkg/tcpip/header",
"//pkg/tcpip/stack",
"//pkg/urpc",
"//runsc/boot",
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index d42de0176..ff48f5646 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -28,6 +28,7 @@ import (
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/boot"
@@ -183,36 +184,39 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
continue
}
- // Keep only IPv4 addresses.
- var ip4addrs []*net.IPNet
+ var ipAddrs []*net.IPNet
for _, ifaddr := range allAddrs {
ipNet, ok := ifaddr.(*net.IPNet)
if !ok {
return fmt.Errorf("address is not IPNet: %+v", ifaddr)
}
- if ipNet.IP.To4() == nil {
- log.Warningf("IPv6 is not supported, skipping: %v", ipNet)
- continue
- }
- ip4addrs = append(ip4addrs, ipNet)
+ ipAddrs = append(ipAddrs, ipNet)
}
- if len(ip4addrs) == 0 {
- log.Warningf("No IPv4 address found for interface %q, skipping", iface.Name)
+ if len(ipAddrs) == 0 {
+ log.Warningf("No usable IP addresses found for interface %q, skipping", iface.Name)
continue
}
// Scrape the routes before removing the address, since that
// will remove the routes as well.
- routes, def, err := routesForIface(iface)
+ routes, defv4, defv6, err := routesForIface(iface)
if err != nil {
return fmt.Errorf("getting routes for interface %q: %v", iface.Name, err)
}
- if def != nil {
- if !args.DefaultGateway.Route.Empty() {
- return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, def, args.DefaultGateway)
+ if defv4 != nil {
+ if !args.Defaultv4Gateway.Route.Empty() {
+ return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv4, args.Defaultv4Gateway)
}
- args.DefaultGateway.Route = *def
- args.DefaultGateway.Name = iface.Name
+ args.Defaultv4Gateway.Route = *defv4
+ args.Defaultv4Gateway.Name = iface.Name
+ }
+
+ if defv6 != nil {
+ if !args.Defaultv6Gateway.Route.Empty() {
+ return fmt.Errorf("more than one default route found, interface: %v, route: %v, default route: %+v", iface.Name, defv6, args.Defaultv6Gateway)
+ }
+ args.Defaultv6Gateway.Route = *defv6
+ args.Defaultv6Gateway.Name = iface.Name
}
link := boot.FDBasedLink{
@@ -247,6 +251,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
}
args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
}
+
if link.GSOMaxSize == 0 && softwareGSO {
// Hardware GSO is disabled. Let's enable software GSO.
link.GSOMaxSize = stack.SoftwareGSOMaxSize
@@ -255,7 +260,7 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, hardwareG
// Collect the addresses for the interface, enable forwarding,
// and remove them from the host.
- for _, addr := range ip4addrs {
+ for _, addr := range ipAddrs {
link.Addresses = append(link.Addresses, addr.IP)
// Steal IP address from NIC.
@@ -316,16 +321,21 @@ func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (
}
}
- // Use SO_RCVBUFFORCE because on linux the receive buffer for an
- // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
- // defaults to a unusually low value of 208KB. This is too low
- // for gVisor to be able to receive packets at high throughputs
- // without incurring packet drops.
- const rcvBufSize = 4 << 20 // 4MB.
+ // Use SO_RCVBUFFORCE/SO_SNDBUFFORCE because on Linux the receive/send buffer
+ // for an AF_PACKET socket is capped by "net.core.rmem_max/wmem_max".
+ // wmem_max/rmem_max default to an unusually low value of 208KB. This is too low
+ // for gVisor to be able to receive packets at high throughputs without
+ // incurring packet drops.
+ const bufSize = 4 << 20 // 4MB.
- if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
- return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, bufSize); err != nil {
+ return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", bufSize, err)
}
+
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_SNDBUFFORCE, bufSize); err != nil {
+ return nil, fmt.Errorf("failed to increase socket snd buffer to %d: %v", bufSize, err)
+ }
+
return &socketEntry{deviceFile, gsoMaxSize}, nil
}
@@ -351,46 +361,56 @@ func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink,
}
// routesForIface iterates over all routes for the given interface and converts
-// them to boot.Routes.
-func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
+// them to boot.Routes. It also returns the default IPv4 and IPv6 routes, if found.
+func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, *boot.Route, error) {
link, err := netlink.LinkByIndex(iface.Index)
if err != nil {
- return nil, nil, err
+ return nil, nil, nil, err
}
rs, err := netlink.RouteList(link, netlink.FAMILY_ALL)
if err != nil {
- return nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
+ return nil, nil, nil, fmt.Errorf("getting routes from %q: %v", iface.Name, err)
}
- var def *boot.Route
+ var defv4, defv6 *boot.Route
var routes []boot.Route
for _, r := range rs {
// Is it a default route?
if r.Dst == nil {
if r.Gw == nil {
- return nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
- }
- if r.Gw.To4() == nil {
- log.Warningf("IPv6 is not supported, skipping default route: %v", r)
- continue
- }
- if def != nil {
- return nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, def, r)
+ return nil, nil, nil, fmt.Errorf("default route with no gateway %q: %+v", iface.Name, r)
}
// Create a catch all route to the gateway.
- def = &boot.Route{
- Destination: net.IPNet{
- IP: net.IPv4zero,
- Mask: net.IPMask(net.IPv4zero),
- },
- Gateway: r.Gw,
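+ // The gateway address length distinguishes IPv4 (4-byte) from IPv6 (16-byte) default routes.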
+ switch len(r.Gw) {
+ case header.IPv4AddressSize:
+ if defv4 != nil {
+ return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv4, r)
+ }
+ defv4 = &boot.Route{
+ Destination: net.IPNet{
+ IP: net.IPv4zero,
+ Mask: net.IPMask(net.IPv4zero),
+ },
+ Gateway: r.Gw,
+ }
+ case header.IPv6AddressSize:
+ if defv6 != nil {
+ return nil, nil, nil, fmt.Errorf("more than one default route found %q, def: %+v, route: %+v", iface.Name, defv6, r)
+ }
+
+ defv6 = &boot.Route{
+ Destination: net.IPNet{
+ IP: net.IPv6zero,
+ Mask: net.IPMask(net.IPv6zero),
+ },
+ Gateway: r.Gw,
+ }
+ default:
+ return nil, nil, nil, fmt.Errorf("unexpected address size for gateway: %+v for route: %+v", r.Gw, r)
}
continue
}
- if r.Dst.IP.To4() == nil {
- log.Warningf("IPv6 is not supported, skipping route: %v", r)
- continue
- }
+
dst := *r.Dst
dst.IP = dst.IP.Mask(dst.Mask)
routes = append(routes, boot.Route{
@@ -398,7 +418,7 @@ func routesForIface(iface net.Interface) ([]boot.Route, *boot.Route, error) {
Gateway: r.Gw,
})
}
- return routes, def, nil
+ return routes, defv4, defv6, nil
}
// removeAddress removes IP address from network device. It's equivalent to:
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index ee9327fc8..ec72bdbfd 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -18,10 +18,10 @@ package sandbox
import (
"context"
"fmt"
+ "math"
"os"
"os/exec"
"strconv"
- "sync"
"syscall"
"time"
@@ -33,6 +33,7 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/urpc"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/boot/platforms"
@@ -631,6 +632,26 @@ func (s *Sandbox) createSandboxProcess(conf *boot.Config, args *Args, startSyncF
if err != nil {
return fmt.Errorf("getting cpu count from cgroups: %v", err)
}
+ if conf.CPUNumFromQuota {
+ // Dropping below 2 CPUs can trigger applications to disable
+ // locks, which can lead to hard-to-debug errors, so leave
+ // two cores as a reasonable default.
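+ // For example, cfs_quota_us=150000 with cfs_period_us=100000 gives a
+ // quota of 1.5, which math.Ceil rounds up to 2 CPUs.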
+ const minCPUs = 2
+
+ quota, err := s.Cgroup.CPUQuota()
+ if err != nil {
+ return fmt.Errorf("getting cpu quota from cgroups: %v", err)
+ }
+ if n := int(math.Ceil(quota)); n > 0 {
+ if n < minCPUs {
+ n = minCPUs
+ }
+ if n < cpuNum {
+ // Only lower the cpu number.
+ cpuNum = n
+ }
+ }
+ }
cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))
mem, err := s.Cgroup.MemoryLimit()
@@ -1004,16 +1025,22 @@ func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
// DestroyContainer destroys the given container. If it is the root container,
// then the entire sandbox is destroyed.
func (s *Sandbox) DestroyContainer(cid string) error {
+ if err := s.destroyContainer(cid); err != nil {
+ // If the sandbox isn't running, the container has already been
+ // destroyed, so ignore the error in this case.
+ if s.IsRunning() {
+ return err
+ }
+ }
+ return nil
+}
+
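+// destroyContainer performs the actual destruction; DestroyContainer wraps it
+// to ignore errors when the sandbox is no longer running.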
+func (s *Sandbox) destroyContainer(cid string) error {
if s.IsRootContainer(cid) {
log.Debugf("Destroying root container %q by destroying sandbox", cid)
return s.destroy()
}
- if !s.IsRunning() {
- // Sandbox isn't running anymore, container is already destroyed.
- return nil
- }
-
log.Debugf("Destroying container %q in sandbox %q", cid, s.ID)
conn, err := s.sandboxConnect()
if err != nil {
diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD
index c96ca2eb6..3c3027cb5 100644
--- a/runsc/testutil/BUILD
+++ b/runsc/testutil/BUILD
@@ -10,6 +10,7 @@ go_library(
visibility = ["//:sandbox"],
deps = [
"//pkg/log",
+ "//pkg/sync",
"//runsc/boot",
"//runsc/specutils",
"@com_github_cenkalti_backoff//:go_default_library",
diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go
index 9632776d2..fb22eae39 100644
--- a/runsc/testutil/testutil.go
+++ b/runsc/testutil/testutil.go
@@ -34,7 +34,6 @@ import (
"path/filepath"
"strconv"
"strings"
- "sync"
"sync/atomic"
"syscall"
"time"
@@ -42,6 +41,7 @@ import (
"github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/runsc/boot"
"gvisor.dev/gvisor/runsc/specutils"
)
diff --git a/scripts/benchmarks.sh b/scripts/benchmarks.sh
new file mode 100755
index 000000000..6b9065b07
--- /dev/null
+++ b/scripts/benchmarks.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ "$#" -lt "1" ]; then
+ echo "usage: $0 <--mock |--env=<filename>> ..."
+ echo "example: $0 --mock --runs=8"
+ exit 1
+fi
+
+source $(dirname $0)/common.sh
+
+readonly TIMESTAMP=`date "+%Y%m%d-%H%M%S"`
+readonly OUTDIR="$(mktemp --tmpdir -d run-${TIMESTAMP}-XXX)"
+readonly DEFAULT_RUNTIMES="--runtime=runc --runtime=runsc --runtime=runsc-kvm"
+readonly ALL_RUNTIMES="--runtime=runc --runtime=runsc --runtime=runsc-kvm"
+
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'fio.(read|write)' --metric=bandwidth --size=5g --ioengine=sync --blocksize=1m > "${OUTDIR}/fio.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} fio.rand --metric=bandwidth --size=5g --ioengine=sync --blocksize=4k --time=30 > "${OUTDIR}/tmp_fio.csv"
+cat "${OUTDIR}/tmp_fio.csv" | grep "\(runc\|runsc\)" >> "${OUTDIR}/fio.csv" && rm "${OUTDIR}/tmp_fio.csv"
+
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'fio.(read|write)' --metric=bandwidth --tmpfs=True --size=5g --ioengine=sync --blocksize=1m > "${OUTDIR}/fio-tmpfs.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} fio.rand --metric=bandwidth --tmpfs=True --size=5g --ioengine=sync --blocksize=4k --time=30 > "${OUTDIR}/tmp_fio.csv"
+cat "${OUTDIR}/tmp_fio.csv" | grep "\(runc\|runsc\)" >> "${OUTDIR}/fio-tmpfs.csv" && rm "${OUTDIR}/tmp_fio.csv"
+
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} startup --count=50 > "${OUTDIR}/startup.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} density > "${OUTDIR}/density.csv"
+
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} sysbench.cpu --threads=1 --max_prime=50000 --options='--max-time=5' > "${OUTDIR}/sysbench-cpu.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} sysbench.memory --threads=1 --options='--memory-block-size=1M --memory-total-size=500G' > "${OUTDIR}/sysbench-memory.csv"
+run //benchmarks:perf -- run "$@" ${ALL_RUNTIMES} syscall > "${OUTDIR}/syscall.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'network.(upload|download)' --runs=20 > "${OUTDIR}/iperf.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} ml.tensorflow > "${OUTDIR}/tensorflow.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} media.ffmpeg > "${OUTDIR}/ffmpeg.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} http.httpd --path=latin100k.txt --connections=1 --connections=5 --connections=10 --connections=25 > "${OUTDIR}/httpd100k.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} http.httpd --path=latin10240k.txt --connections=1 --connections=5 --connections=10 --connections=25 > "${OUTDIR}/httpd10240k.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} redis > "${OUTDIR}/redis.csv"
+run //benchmarks:perf -- run "$@" ${DEFAULT_RUNTIMES} 'http.(ruby|node)' > "${OUTDIR}/applications.csv"
+
+echo "${OUTPUT}" && exit 0
diff --git a/scripts/common.sh b/scripts/common.sh
index 6dabad141..fdb1aa142 100755
--- a/scripts/common.sh
+++ b/scripts/common.sh
@@ -73,7 +73,7 @@ function install_runsc() {
sudo "${RUNSC_BIN}" install --experimental=true --runtime="${runtime}" -- --debug-log "${RUNSC_LOGS}" "$@"
# Clear old logs files that may exist.
- sudo rm -f "${RUNSC_LOGS_DIR}"/*
+ sudo rm -f "${RUNSC_LOGS_DIR}"/'*'
# Restart docker to pick up the new runtime configuration.
sudo systemctl restart docker
diff --git a/scripts/dev.sh b/scripts/dev.sh
index c67003018..6238b4d0b 100755
--- a/scripts/dev.sh
+++ b/scripts/dev.sh
@@ -54,9 +54,10 @@ declare OUTPUT="$(build //runsc)"
if [[ ${REFRESH} -eq 0 ]]; then
install_runsc "${RUNTIME}" --net-raw
install_runsc "${RUNTIME}-d" --net-raw --debug --strace --log-packets
+ install_runsc "${RUNTIME}-p" --net-raw --profile
echo
- echo "Runtimes ${RUNTIME} and ${RUNTIME}-d (debug enabled) setup."
+ echo "Runtimes ${RUNTIME}, ${RUNTIME}-d (debug enabled), and ${RUNTIME}-p installed."
echo "Use --runtime="${RUNTIME}" with your Docker command."
echo " docker run --rm --runtime="${RUNTIME}" hello-world"
echo
diff --git a/scripts/go.sh b/scripts/go.sh
index 0dbfb7747..626ed8fa4 100755
--- a/scripts/go.sh
+++ b/scripts/go.sh
@@ -25,6 +25,8 @@ tools/go_branch.sh
# Checkout the new branch.
git checkout go && git clean -f
+go version
+
# Build everything.
go build ./...
diff --git a/scripts/iptables_tests.sh b/scripts/iptables_tests.sh
new file mode 100755
index 000000000..c47cbd675
--- /dev/null
+++ b/scripts/iptables_tests.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source $(dirname $0)/common.sh
+
+install_runsc_for_test iptables
+
+# Build the docker image for the test.
+run //test/iptables/runner --norun
+
+# TODO(gvisor.dev/issue/170): Also test this on runsc once iptables are better
+# supported
+test //test/iptables:iptables_test "--test_arg=--runtime=runc" \
+ "--test_arg=--image=bazel/test/iptables/runner:runner"
diff --git a/scripts/issue_reviver.sh b/scripts/issue_reviver.sh
new file mode 100755
index 000000000..bac9b9192
--- /dev/null
+++ b/scripts/issue_reviver.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DIR=$(dirname $0)
+source "${DIR}"/common.sh
+
+# Provide a credential file if available.
+export OAUTH_TOKEN_FILE=""
+if [[ -v KOKORO_GITHUB_ACCESS_TOKEN ]]; then
+ OAUTH_TOKEN_FILE="${KOKORO_KEYSTORE_DIR}/${KOKORO_GITHUB_ACCESS_TOKEN}"
+fi
+
+REPO_ROOT=$(cd "$(dirname "${DIR}")"; pwd)
+run //tools/issue_reviver:issue_reviver --path "${REPO_ROOT}" --oauth-token-file="${OAUTH_TOKEN_FILE}"
diff --git a/scripts/simple_tests.sh b/scripts/simple_tests.sh
index 585216aae..3a15050c2 100755
--- a/scripts/simple_tests.sh
+++ b/scripts/simple_tests.sh
@@ -17,4 +17,4 @@
source $(dirname $0)/common.sh
# Run all simple tests (locally).
-test //pkg/... //runsc/... //tools/...
+test //pkg/... //runsc/... //tools/... //benchmarks/... //benchmarks/runner:runner_test
diff --git a/test/iptables/BUILD b/test/iptables/BUILD
new file mode 100644
index 000000000..fa833c3b2
--- /dev/null
+++ b/test/iptables/BUILD
@@ -0,0 +1,31 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "iptables",
+ srcs = [
+ "filter_input.go",
+ "iptables.go",
+ "iptables_util.go",
+ ],
+ importpath = "gvisor.dev/gvisor/test/iptables",
+ visibility = ["//test/iptables:__subpackages__"],
+)
+
+go_test(
+ name = "iptables_test",
+ srcs = [
+ "iptables_test.go",
+ ],
+ embed = [":iptables"],
+ tags = [
+ "local",
+ "manual",
+ ],
+ deps = [
+ "//pkg/log",
+ "//runsc/dockerutil",
+ "//runsc/testutil",
+ ],
+)
diff --git a/test/iptables/README.md b/test/iptables/README.md
new file mode 100644
index 000000000..9f8e34420
--- /dev/null
+++ b/test/iptables/README.md
@@ -0,0 +1,44 @@
+# iptables Tests
+
+iptables tests are run via `scripts/iptables_tests.sh`.
+
+## Test Structure
+
+Each test implements `TestCase`, providing (1) a function to run inside the
+container and (2) a function to run locally. Those processes are given each
+other's IP addresses. The test succeeds when both functions succeed.
+
+The function inside the container (`ContainerAction`) typically sets some
+iptables rules and then tries to send or receive packets. The local function
+(`LocalAction`) will typically just send or receive packets.
+
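+A minimal sketch of a test (the `ExampleDrop` name is illustrative; helpers
+like `filterTable`, `listenUDP`, and `sendUDPLoop` are defined in this
+package):
+
+```go
+// ExampleDrop drops all UDP input and expects the listener to time out.
+type ExampleDrop struct{}
+
+// Name implements TestCase.Name.
+func (ExampleDrop) Name() string { return "ExampleDrop" }
+
+// ContainerAction implements TestCase.ContainerAction.
+func (ExampleDrop) ContainerAction(ip net.IP) error {
+	if err := filterTable("-A", "INPUT", "-p", "udp", "-j", "DROP"); err != nil {
+		return err
+	}
+	// Listening should fail with a timeout because all UDP input is dropped.
+	if err := listenUDP(dropPort, sendloopDuration); err == nil {
+		return fmt.Errorf("expected no packets, but one arrived")
+	}
+	return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (ExampleDrop) LocalAction(ip net.IP) error {
+	return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+```
+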
+### Adding Tests
+
+1) Add your test to the `iptables` package.
+
+2) Register the test in an `init` function via `RegisterTestCase` (see
+`filter_input.go` as an example).
+
+3) Add it to `iptables_test.go` (see the other tests in that file).
+
+Your test is now runnable with bazel!
+
+## Run individual tests
+
+Build the testing Docker container:
+
+```bash
+$ bazel run //test/iptables/runner -- --norun
+```
+
+Run an individual test via:
+
+```bash
+$ bazel test //test/iptables:iptables_test --test_filter=<TESTNAME>
+```
+
+To run an individual test with `runc`:
+
+```bash
+$ bazel test //test/iptables:iptables_test --test_filter=<TESTNAME> --test_arg=--runtime=runc
+```
diff --git a/test/iptables/filter_input.go b/test/iptables/filter_input.go
new file mode 100644
index 000000000..923f44e68
--- /dev/null
+++ b/test/iptables/filter_input.go
@@ -0,0 +1,124 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+ "fmt"
+ "net"
+ "time"
+)
+
+const (
+ dropPort = 2401
+ acceptPort = 2402
+ sendloopDuration = 2 * time.Second
+ network = "udp4"
+)
+
+func init() {
+ RegisterTestCase(FilterInputDropUDP{})
+ RegisterTestCase(FilterInputDropUDPPort{})
+ RegisterTestCase(FilterInputDropDifferentUDPPort{})
+}
+
+// FilterInputDropUDP tests that we can drop UDP traffic.
+type FilterInputDropUDP struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropUDP) Name() string {
+ return "FilterInputDropUDP"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropUDP) ContainerAction(ip net.IP) error {
+ if err := filterTable("-A", "INPUT", "-p", "udp", "-j", "DROP"); err != nil {
+ return err
+ }
+
+ // Listen for UDP packets on dropPort.
+ if err := listenUDP(dropPort, sendloopDuration); err == nil {
+ return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+ } else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+ return fmt.Errorf("error reading: %v", err)
+ }
+
+ // At this point we know that reading timed out and never received a
+ // packet.
+ return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropUDP) LocalAction(ip net.IP) error {
+ return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// FilterInputDropUDPPort tests that we can drop UDP traffic by port.
+type FilterInputDropUDPPort struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropUDPPort) Name() string {
+ return "FilterInputDropUDPPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropUDPPort) ContainerAction(ip net.IP) error {
+ if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+ return err
+ }
+
+ // Listen for UDP packets on dropPort.
+ if err := listenUDP(dropPort, sendloopDuration); err == nil {
+ return fmt.Errorf("packets on port %d should have been dropped, but got a packet", dropPort)
+ } else if netErr, ok := err.(net.Error); !ok || !netErr.Timeout() {
+ return fmt.Errorf("error reading: %v", err)
+ }
+
+ // At this point we know that reading timed out and never received a
+ // packet.
+ return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropUDPPort) LocalAction(ip net.IP) error {
+ return sendUDPLoop(ip, dropPort, sendloopDuration)
+}
+
+// FilterInputDropDifferentUDPPort tests that dropping traffic for a single UDP port
+// doesn't drop packets on other ports.
+type FilterInputDropDifferentUDPPort struct{}
+
+// Name implements TestCase.Name.
+func (FilterInputDropDifferentUDPPort) Name() string {
+ return "FilterInputDropDifferentUDPPort"
+}
+
+// ContainerAction implements TestCase.ContainerAction.
+func (FilterInputDropDifferentUDPPort) ContainerAction(ip net.IP) error {
+ if err := filterTable("-A", "INPUT", "-p", "udp", "-m", "udp", "--destination-port", fmt.Sprintf("%d", dropPort), "-j", "DROP"); err != nil {
+ return err
+ }
+
+ // Listen for UDP packets on another port.
+ if err := listenUDP(acceptPort, sendloopDuration); err != nil {
+ return fmt.Errorf("packets on port %d should be allowed, but encountered an error: %v", acceptPort, err)
+ }
+
+ return nil
+}
+
+// LocalAction implements TestCase.LocalAction.
+func (FilterInputDropDifferentUDPPort) LocalAction(ip net.IP) error {
+ return sendUDPLoop(ip, acceptPort, sendloopDuration)
+}
diff --git a/test/iptables/iptables.go b/test/iptables/iptables.go
new file mode 100644
index 000000000..2e565d988
--- /dev/null
+++ b/test/iptables/iptables.go
@@ -0,0 +1,53 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package iptables contains a set of iptables tests implemented as TestCases
+package iptables
+
+import (
+ "fmt"
+ "net"
+)
+
+// IPExchangePort is the port the container listens on to receive the IP
+// address of the local process.
+const IPExchangePort = 2349
+
+// A TestCase contains one action to run in the container and one to run
+// locally. The actions run concurrently and each must succeed for the test
+// to pass.
+type TestCase interface {
+ // Name returns the name of the test.
+ Name() string
+
+ // ContainerAction runs inside the container. It receives the IP of the
+ // local process.
+ ContainerAction(ip net.IP) error
+
+ // LocalAction runs locally. It receives the IP of the container.
+ LocalAction(ip net.IP) error
+}
+
+// Tests maps test names to TestCase.
+//
+// New TestCases are added by calling RegisterTestCase in an init function.
+var Tests = map[string]TestCase{}
+
+// RegisterTestCase registers tc so it can be run.
+func RegisterTestCase(tc TestCase) {
+ if _, ok := Tests[tc.Name()]; ok {
+ panic(fmt.Sprintf("TestCase %s already registered.", tc.Name()))
+ }
+ Tests[tc.Name()] = tc
+}
diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go
new file mode 100644
index 000000000..bfbf1bb87
--- /dev/null
+++ b/test/iptables/iptables_test.go
@@ -0,0 +1,179 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+ "fmt"
+ "net"
+ "os"
+ "path"
+ "testing"
+ "time"
+
+ "flag"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/runsc/dockerutil"
+ "gvisor.dev/gvisor/runsc/testutil"
+)
+
+const timeout time.Duration = 10 * time.Second
+
+var image = flag.String("image", "bazel/test/iptables/runner:runner", "image to run tests in")
+
+type result struct {
+ output string
+ err error
+}
+
+// singleTest runs a TestCase. Each test follows a pattern:
+// - Create a container.
+// - Get the container's IP.
+// - Send the container our IP.
+// - Start a new goroutine running the local action of the test.
+// - Wait for both the container and local actions to finish.
+//
+// Container output is logged to $TEST_UNDECLARED_OUTPUTS_DIR if it exists, or
+// to stderr.
+func singleTest(test TestCase) error {
+ if _, ok := Tests[test.Name()]; !ok {
+ return fmt.Errorf("no test found with name %q. Has it been registered?", test.Name())
+ }
+
+ // Create and start the container.
+ cont := dockerutil.MakeDocker("gvisor-iptables")
+ defer cont.CleanUp()
+ resultChan := make(chan *result)
+ go func() {
+ output, err := cont.RunFg("--cap-add=NET_ADMIN", *image, "-name", test.Name())
+ logContainer(output, err)
+ resultChan <- &result{output, err}
+ }()
+
+ // Get the container IP.
+ ip, err := getIP(cont)
+ if err != nil {
+ return fmt.Errorf("failed to get container IP: %v", err)
+ }
+
+ // Give the container our IP.
+ if err := sendIP(ip); err != nil {
+ return fmt.Errorf("failed to send IP to container: %v", err)
+ }
+
+ // Run our side of the test.
+ errChan := make(chan error)
+ go func() {
+ errChan <- test.LocalAction(ip)
+ }()
+
+ // Wait for both the container and local tests to finish.
+ var res *result
+ to := time.After(timeout)
+ for localDone := false; res == nil || !localDone; {
+ select {
+ case res = <-resultChan:
+ log.Infof("Container finished.")
+ case err, localDone = <-errChan:
+ log.Infof("Local finished.")
+ if err != nil {
+ return fmt.Errorf("local test failed: %v", err)
+ }
+ case <-to:
+ return fmt.Errorf("timed out after %f seconds", timeout.Seconds())
+ }
+ }
+
+ return res.err
+}
+
+func getIP(cont dockerutil.Docker) (net.IP, error) {
+ // The container might not have started yet, so retry a few times.
+ var ipStr string
+ to := time.After(timeout)
+ for ipStr == "" {
+ ipStr, _ = cont.FindIP()
+ select {
+ case <-to:
+ return net.IP{}, fmt.Errorf("timed out getting IP after %f seconds", timeout.Seconds())
+ default:
+ time.Sleep(250 * time.Millisecond)
+ }
+ }
+ ip := net.ParseIP(ipStr)
+ if ip == nil {
+ return net.IP{}, fmt.Errorf("invalid IP: %q", ipStr)
+ }
+ log.Infof("Container has IP of %s", ipStr)
+ return ip, nil
+}
+
+func sendIP(ip net.IP) error {
+ contAddr := net.TCPAddr{
+ IP: ip,
+ Port: IPExchangePort,
+ }
+ var conn *net.TCPConn
+ // The container may not be listening when we first connect, so retry
+ // upon error.
+ cb := func() error {
+ c, err := net.DialTCP("tcp4", nil, &contAddr)
+ conn = c
+ return err
+ }
+ if err := testutil.Poll(cb, timeout); err != nil {
+ return fmt.Errorf("timed out waiting to send IP, most recent error: %v", err)
+ }
+ if _, err := conn.Write([]byte{0}); err != nil {
+ return fmt.Errorf("error writing to container: %v", err)
+ }
+ return nil
+}
+
+func logContainer(output string, err error) {
+ msg := fmt.Sprintf("Container error: %v\nContainer output:\n%v", err, output)
+ if artifactsDir := os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); artifactsDir != "" {
+ fpath := path.Join(artifactsDir, "container.log")
+ if file, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE, 0644); err != nil {
+ log.Warningf("Failed to open log file %q: %v", fpath, err)
+ } else {
+ defer file.Close()
+ if _, err := file.Write([]byte(msg)); err == nil {
+ return
+ }
+ log.Warningf("Failed to write to log file %s: %v", fpath, err)
+ }
+ }
+
+ // We couldn't write to the output directory -- just log to stderr.
+ log.Infof(msg)
+}
+
+func TestFilterInputDropUDP(t *testing.T) {
+ if err := singleTest(FilterInputDropUDP{}); err != nil {
+ t.Fatal(err)
+ }
+}
+
+func TestFilterInputDropUDPPort(t *testing.T) {
+ if err := singleTest(FilterInputDropUDPPort{}); err != nil {
+ t.Fatal(err)
+ }
+}
+
+func TestFilterInputDropDifferentUDPPort(t *testing.T) {
+ if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil {
+ t.Fatal(err)
+ }
+}
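Note that registration alone does not run anything: each registered case also needs an explicit Go test entry point like the three above, which hands the case to singleTest. For the hypothetical ExampleNoop case sketched earlier, that entry point would be:

func TestExampleNoop(t *testing.T) {
	if err := singleTest(ExampleNoop{}); err != nil {
		t.Fatal(err)
	}
}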
diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go
new file mode 100644
index 000000000..3a4d11f1a
--- /dev/null
+++ b/test/iptables/iptables_util.go
@@ -0,0 +1,82 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package iptables
+
+import (
+ "fmt"
+ "net"
+ "os/exec"
+ "time"
+)
+
+const iptablesBinary = "iptables"
+
+// filterTable calls `iptables -t filter` with the given args.
+func filterTable(args ...string) error {
+ args = append([]string{"-t", "filter"}, args...)
+ cmd := exec.Command(iptablesBinary, args...)
+ if out, err := cmd.CombinedOutput(); err != nil {
+ return fmt.Errorf("error running iptables with args %v\nerror: %v\noutput: %s", args, err, string(out))
+ }
+ return nil
+}
+
+// listenUDP listens on a UDP port and returns the error from the first
+// net.Conn.Read() on that port.
+func listenUDP(port int, timeout time.Duration) error {
+ localAddr := net.UDPAddr{
+ Port: port,
+ }
+ conn, err := net.ListenUDP(network, &localAddr)
+ if err != nil {
+ return err
+ }
+ defer conn.Close()
+ conn.SetDeadline(time.Now().Add(timeout))
+ _, err = conn.Read([]byte{0})
+ return err
+}
+
+// sendUDPLoop repeatedly sends 1-byte UDP packets to the given IP and port
+// for the given duration.
+func sendUDPLoop(ip net.IP, port int, duration time.Duration) error {
+ // Send packets for the given duration.
+ remote := net.UDPAddr{
+ IP: ip,
+ Port: port,
+ }
+ conn, err := net.DialUDP(network, nil, &remote)
+ if err != nil {
+ return err
+ }
+ defer conn.Close()
+
+ to := time.After(duration)
+ for timedOut := false; !timedOut; {
+ // This may return an error (connection refused) if the remote
+ // hasn't started listening yet or is dropping our packets. So we
+ // ignore Write errors and depend on the remote to report a
+ // failure if it doesn't get a packet it needs.
+ conn.Write([]byte{0})
+ select {
+ case <-to:
+ timedOut = true
+ default:
+ time.Sleep(200 * time.Millisecond)
+ }
+ }
+
+ return nil
+}
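As a sketch of how these helpers compose into a TestCase (the port, durations, and type name below are placeholders chosen for illustration, not values defined by this change), a drop-style case installs a filter rule inside the container and expects listenUDP to fail with a timeout, while the local side keeps traffic flowing with sendUDPLoop:

// example_drop.go (hypothetical): composes filterTable, listenUDP and
// sendUDPLoop; the real drop cases in this package also verify that the
// returned error is specifically a timeout.
package iptables

import (
	"fmt"
	"net"
	"time"
)

const exampleDropPort = 2401

func init() {
	RegisterTestCase(ExampleInputDropUDP{})
}

// ExampleInputDropUDP drops all incoming UDP traffic in the container.
type ExampleInputDropUDP struct{}

// Name implements TestCase.Name.
func (ExampleInputDropUDP) Name() string { return "ExampleInputDropUDP" }

// ContainerAction installs a DROP rule for incoming UDP, then expects the
// read in listenUDP to time out because no packet should arrive.
func (ExampleInputDropUDP) ContainerAction(ip net.IP) error {
	if err := filterTable("-A", "INPUT", "-p", "udp", "-j", "DROP"); err != nil {
		return err
	}
	if err := listenUDP(exampleDropPort, 5*time.Second); err == nil {
		return fmt.Errorf("UDP packets should have been dropped, but a read succeeded")
	}
	return nil
}

// LocalAction keeps sending packets toward the container for the same window.
func (ExampleInputDropUDP) LocalAction(ip net.IP) error {
	return sendUDPLoop(ip, exampleDropPort, 5*time.Second)
}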
diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD
new file mode 100644
index 000000000..c6c42d870
--- /dev/null
+++ b/test/iptables/runner/BUILD
@@ -0,0 +1,16 @@
+load("@io_bazel_rules_docker//go:image.bzl", "go_image")
+load("@io_bazel_rules_docker//container:container.bzl", "container_image")
+
+package(licenses = ["notice"])
+
+container_image(
+ name = "iptables-base",
+ base = "@iptables-test//image",
+)
+
+go_image(
+ name = "runner",
+ srcs = ["main.go"],
+ base = ":iptables-base",
+ deps = ["//test/iptables"],
+)
diff --git a/test/iptables/runner/Dockerfile b/test/iptables/runner/Dockerfile
new file mode 100644
index 000000000..b77db44a1
--- /dev/null
+++ b/test/iptables/runner/Dockerfile
@@ -0,0 +1,4 @@
+# This Dockerfile builds the image hosted at
+# gcr.io/gvisor-presubmit/iptables-test.
+FROM ubuntu
+RUN apt update && apt install -y iptables
diff --git a/test/iptables/runner/main.go b/test/iptables/runner/main.go
new file mode 100644
index 000000000..3c794114e
--- /dev/null
+++ b/test/iptables/runner/main.go
@@ -0,0 +1,70 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package main runs iptables tests from within a docker container.
+package main
+
+import (
+ "flag"
+ "fmt"
+ "log"
+ "net"
+
+ "gvisor.dev/gvisor/test/iptables"
+)
+
+var name = flag.String("name", "", "name of the test to run")
+
+func main() {
+ flag.Parse()
+
+ // Find out which test we're running.
+ test, ok := iptables.Tests[*name]
+ if !ok {
+ log.Fatalf("No test found named %q", *name)
+ }
+ log.Printf("Running test %q", *name)
+
+ // Get the IP of the local process.
+ ip, err := getIP()
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ // Run the test.
+ if err := test.ContainerAction(ip); err != nil {
+ log.Fatalf("Failed running test %q: %v", *name, err)
+ }
+}
+
+// getIP listens for a connection from the local process and returns the source
+// IP of that connection.
+func getIP() (net.IP, error) {
+ localAddr := net.TCPAddr{
+ Port: iptables.IPExchangePort,
+ }
+ listener, err := net.ListenTCP("tcp4", &localAddr)
+ if err != nil {
+ return net.IP{}, fmt.Errorf("failed listening for IP: %v", err)
+ }
+ defer listener.Close()
+ conn, err := listener.AcceptTCP()
+ if err != nil {
+ return net.IP{}, fmt.Errorf("failed accepting IP: %v", err)
+ }
+ defer conn.Close()
+ log.Printf("Connected to %v", conn.RemoteAddr())
+
+ return conn.RemoteAddr().(*net.TCPAddr).IP, nil
+}
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 722d14b53..a3a85917d 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -376,6 +376,8 @@ syscall_test(
syscall_test(test = "//test/syscalls/linux:rlimits_test")
+syscall_test(test = "//test/syscalls/linux:rseq_test")
+
syscall_test(test = "//test/syscalls/linux:rtsignal_test")
syscall_test(test = "//test/syscalls/linux:sched_test")
@@ -669,6 +671,7 @@ syscall_test(test = "//test/syscalls/linux:udp_bind_test")
syscall_test(
size = "medium",
+ add_hostinet = True,
shard_count = 10,
test = "//test/syscalls/linux:udp_socket_test",
)
@@ -714,6 +717,11 @@ syscall_test(test = "//test/syscalls/linux:proc_net_tcp_test")
syscall_test(test = "//test/syscalls/linux:proc_net_udp_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:xattr_test",
+)
+
go_binary(
name = "syscall_test_runner",
testonly = 1,
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index dcf5b73ed..aaf77c65b 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -9,6 +9,7 @@ def syscall_test(
use_tmpfs = False,
add_overlay = False,
add_uds_tree = False,
+ add_hostinet = False,
tags = None):
_syscall_test(
test = test,
@@ -65,6 +66,18 @@ def syscall_test(
file_access = "shared",
)
+ if add_hostinet:
+ _syscall_test(
+ test = test,
+ shard_count = shard_count,
+ size = size,
+ platform = "ptrace",
+ use_tmpfs = use_tmpfs,
+ network = "host",
+ add_uds_tree = add_uds_tree,
+ tags = tags,
+ )
+
def _syscall_test(
test,
shard_count,
@@ -72,6 +85,7 @@ def _syscall_test(
platform,
use_tmpfs,
tags,
+ network = "none",
file_access = "exclusive",
overlay = False,
add_uds_tree = False):
@@ -85,6 +99,8 @@ def _syscall_test(
name += "_shared"
if overlay:
name += "_overlay"
+ if network != "none":
+ name += "_" + network + "net"
if tags == None:
tags = []
@@ -107,6 +123,7 @@ def _syscall_test(
# Arguments are passed directly to syscall_test_runner binary.
"--test-name=" + test_name,
"--platform=" + platform,
+ "--network=" + network,
"--use-tmpfs=" + str(use_tmpfs),
"--file-access=" + file_access,
"--overlay=" + str(overlay),
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 6345ea28c..4c7ec3f06 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -6,6 +6,19 @@ package(
licenses = ["notice"],
)
+exports_files(
+ [
+ "socket.cc",
+ "socket_inet_loopback.cc",
+ "socket_ip_loopback_blocking.cc",
+ "socket_ip_tcp_loopback.cc",
+ "socket_ipv4_udp_unbound_loopback.cc",
+ "tcp_socket.cc",
+ "udp_socket.cc",
+ ],
+ visibility = ["//:sandbox"],
+)
+
cc_binary(
name = "sigaltstack_check",
testonly = 1,
@@ -743,6 +756,7 @@ cc_binary(
"//test/util:eventfd_util",
"//test/util:multiprocess_util",
"//test/util:posix_error",
+ "//test/util:save_util",
"//test/util:temp_path",
"//test/util:test_util",
"//test/util:timer_util",
@@ -1795,7 +1809,6 @@ cc_binary(
name = "readv_socket_test",
testonly = 1,
srcs = [
- "file_base.h",
"readv_common.cc",
"readv_common.h",
"readv_socket.cc",
@@ -1843,6 +1856,22 @@ cc_binary(
)
cc_binary(
+ name = "rseq_test",
+ testonly = 1,
+ srcs = ["rseq.cc"],
+ data = ["//test/syscalls/linux/rseq"],
+ linkstatic = 1,
+ deps = [
+ "//test/syscalls/linux/rseq:lib",
+ "//test/util:logging",
+ "//test/util:multiprocess_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+cc_binary(
name = "rtsignal_test",
testonly = 1,
srcs = ["rtsignal.cc"],
@@ -2142,6 +2171,7 @@ cc_library(
":socket_test_util",
"//test/util:test_util",
"//test/util:thread_util",
+ "@com_google_absl//absl/time",
"@com_google_googletest//:gtest",
],
alwayslink = 1,
@@ -2663,6 +2693,7 @@ cc_binary(
srcs = ["socket_inet_loopback.cc"],
linkstatic = 1,
deps = [
+ ":ip_socket_test_util",
":socket_test_util",
"//test/util:file_descriptor",
"//test/util:posix_error",
@@ -2858,7 +2889,6 @@ cc_library(
":unix_domain_socket_test_util",
"//test/util:test_util",
"//test/util:thread_util",
- "//test/util:timer_util",
"@com_google_absl//absl/time",
"@com_google_googletest//:gtest",
],
@@ -3245,8 +3275,6 @@ cc_binary(
testonly = 1,
srcs = ["tcp_socket.cc"],
linkstatic = 1,
- # FIXME(b/135470853)
- tags = ["flaky"],
deps = [
":socket_test_util",
"//test/util:file_descriptor",
@@ -3351,11 +3379,15 @@ cc_binary(
],
)
-cc_binary(
- name = "udp_socket_test",
+cc_library(
+ name = "udp_socket_test_cases",
testonly = 1,
- srcs = ["udp_socket.cc"],
- linkstatic = 1,
+ srcs = [
+ "udp_socket_test_cases.cc",
+ ] + select_for_linux([
+ "udp_socket_errqueue_test_case.cc",
+ ]),
+ hdrs = ["udp_socket_test_cases.h"],
deps = [
":socket_test_util",
":unix_domain_socket_test_util",
@@ -3366,6 +3398,17 @@ cc_binary(
"@com_google_absl//absl/time",
"@com_google_googletest//:gtest",
],
+ alwayslink = 1,
+)
+
+cc_binary(
+ name = "udp_socket_test",
+ testonly = 1,
+ srcs = ["udp_socket.cc"],
+ linkstatic = 1,
+ deps = [
+ ":udp_socket_test_cases",
+ ],
)
cc_binary(
@@ -3682,3 +3725,24 @@ cc_binary(
"@com_google_googletest//:gtest",
],
)
+
+cc_binary(
+ name = "xattr_test",
+ testonly = 1,
+ srcs = [
+ "file_base.h",
+ "xattr.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "@com_google_absl//absl/strings",
+ "@com_google_googletest//:gtest",
+ ],
+)
diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc
index b27d4e10a..a33daff17 100644
--- a/test/syscalls/linux/aio.cc
+++ b/test/syscalls/linux/aio.cc
@@ -129,7 +129,7 @@ TEST_F(AIOTest, BasicWrite) {
// aio implementation uses aio_ring. gVisor doesn't and returns all zeroes.
// Linux implements aio_ring, so skip the zeroes check.
//
- // TODO(b/65486370): Remove when gVisor implements aio_ring.
+ // TODO(gvisor.dev/issue/204): Remove when gVisor implements aio_ring.
auto ring = reinterpret_cast<struct aio_ring*>(ctx_);
auto magic = IsRunningOnGvisor() ? 0 : AIO_RING_MAGIC;
EXPECT_EQ(ring->magic, magic);
diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc
index 2aa91691e..7f6015049 100644
--- a/test/syscalls/linux/clock_gettime.cc
+++ b/test/syscalls/linux/clock_gettime.cc
@@ -56,11 +56,6 @@ void spin_ns(int64_t ns) {
// Test that CLOCK_PROCESS_CPUTIME_ID is a superset of CLOCK_THREAD_CPUTIME_ID.
TEST(ClockGettime, CputimeId) {
- // TODO(b/128871825,golang.org/issue/10958): Test times out when there is a
- // small number of core because one goroutine starves the others.
- printf("CPUS: %d\n", std::thread::hardware_concurrency());
- SKIP_IF(std::thread::hardware_concurrency() <= 2);
-
constexpr int kNumThreads = 13; // arbitrary
absl::Duration spin_time = absl::Seconds(1);
diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc
index 370e85166..3d564e720 100644
--- a/test/syscalls/linux/exceptions.cc
+++ b/test/syscalls/linux/exceptions.cc
@@ -22,6 +22,23 @@
namespace gvisor {
namespace testing {
+// Default value for the x87 FPU control word. See Intel SDM Vol 1, Ch 8.1.5
+// "x87 FPU Control Word".
+constexpr uint16_t kX87ControlWordDefault = 0x37f;
+
+// Mask for the divide-by-zero exception.
+constexpr uint16_t kX87ControlWordDiv0Mask = 1 << 2;
+
+// Default value for the SSE control register (MXCSR). See Intel SDM Vol 1, Ch
+// 11.6.4 "Initialization of SSE/SSE3 Extensions".
+constexpr uint32_t kMXCSRDefault = 0x1f80;
+
+// Mask for the divide-by-zero exception.
+constexpr uint32_t kMXCSRDiv0Mask = 1 << 9;
+
+// Flag for a pending divide-by-zero exception.
+constexpr uint32_t kMXCSRDiv0Flag = 1 << 2;
+
void inline Halt() { asm("hlt\r\n"); }
void inline SetAlignmentCheck() {
@@ -107,6 +124,170 @@ TEST(ExceptionTest, DivideByZero) {
::testing::KilledBySignal(SIGFPE), "");
}
+// By default, x87 exceptions are masked and the faulting operation simply
+// returns a default value.
+TEST(ExceptionTest, X87DivideByZeroMasked) {
+ int32_t quotient;
+ int32_t value = 1;
+ int32_t divisor = 0;
+ asm("fildl %[value]\r\n"
+ "fidivl %[divisor]\r\n"
+ "fistpl %[quotient]\r\n"
+ : [ quotient ] "=m"(quotient)
+ : [ value ] "m"(value), [ divisor ] "m"(divisor));
+
+ EXPECT_EQ(quotient, INT32_MIN);
+}
+
+// When unmasked, division by zero raises SIGFPE.
+TEST(ExceptionTest, X87DivideByZeroUnmasked) {
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+ EXPECT_EXIT(
+ {
+ // Clear the divide by zero exception mask.
+ constexpr uint16_t kControlWord =
+ kX87ControlWordDefault & ~kX87ControlWordDiv0Mask;
+
+ int32_t quotient;
+ int32_t value = 1;
+ int32_t divisor = 0;
+ asm volatile(
+ "fldcw %[cw]\r\n"
+ "fildl %[value]\r\n"
+ "fidivl %[divisor]\r\n"
+ "fistpl %[quotient]\r\n"
+ : [ quotient ] "=m"(quotient)
+ : [ cw ] "m"(kControlWord), [ value ] "m"(value),
+ [ divisor ] "m"(divisor));
+ },
+ ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// Pending exceptions in the x87 status register are not clobbered by syscalls.
+TEST(ExceptionTest, X87StatusClobber) {
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+ EXPECT_EXIT(
+ {
+ // Clear the divide by zero exception mask.
+ constexpr uint16_t kControlWord =
+ kX87ControlWordDefault & ~kX87ControlWordDiv0Mask;
+
+ int32_t quotient;
+ int32_t value = 1;
+ int32_t divisor = 0;
+ asm volatile(
+ "fildl %[value]\r\n"
+ "fidivl %[divisor]\r\n"
+ // Exception is masked, so it does not occur here.
+ "fistpl %[quotient]\r\n"
+
+ // SYS_getpid placed in rax by constraint.
+ "syscall\r\n"
+
+ // Unmask exception. The syscall didn't clobber the pending
+ // exception, so now it can be raised.
+ //
+ // N.B. "a floating-point exception will be generated upon execution
+ // of the *next* floating-point instruction".
+ "fldcw %[cw]\r\n"
+ "fwait\r\n"
+ : [ quotient ] "=m"(quotient)
+ : [ value ] "m"(value), [ divisor ] "m"(divisor), "a"(SYS_getpid),
+ [ cw ] "m"(kControlWord)
+ : "rcx", "r11");
+ },
+ ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// By default, SSE exceptions are masked and the faulting operation simply
+// returns a default value.
+TEST(ExceptionTest, SSEDivideByZeroMasked) {
+ uint32_t status;
+ int32_t quotient;
+ int32_t value = 1;
+ int32_t divisor = 0;
+ asm("cvtsi2ssl %[value], %%xmm0\r\n"
+ "cvtsi2ssl %[divisor], %%xmm1\r\n"
+ "divss %%xmm1, %%xmm0\r\n"
+ "cvtss2sil %%xmm0, %[quotient]\r\n"
+ : [ quotient ] "=r"(quotient), [ status ] "=r"(status)
+ : [ value ] "r"(value), [ divisor ] "r"(divisor)
+ : "xmm0", "xmm1");
+
+ EXPECT_EQ(quotient, INT32_MIN);
+}
+
+// When unmasked, division by zero raises SIGFPE.
+TEST(ExceptionTest, SSEDivideByZeroUnmasked) {
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+ EXPECT_EXIT(
+ {
+ // Clear the divide by zero exception mask.
+ constexpr uint32_t kMXCSR = kMXCSRDefault & ~kMXCSRDiv0Mask;
+
+ int32_t quotient;
+ int32_t value = 1;
+ int32_t divisor = 0;
+ asm volatile(
+ "ldmxcsr %[mxcsr]\r\n"
+ "cvtsi2ssl %[value], %%xmm0\r\n"
+ "cvtsi2ssl %[divisor], %%xmm1\r\n"
+ "divss %%xmm1, %%xmm0\r\n"
+ "cvtss2sil %%xmm0, %[quotient]\r\n"
+ : [ quotient ] "=r"(quotient)
+ : [ mxcsr ] "m"(kMXCSR), [ value ] "r"(value),
+ [ divisor ] "r"(divisor)
+ : "xmm0", "xmm1");
+ },
+ ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// Pending exceptions in the SSE status register are not clobbered by syscalls.
+TEST(ExceptionTest, SSEStatusClobber) {
+ uint32_t mxcsr;
+ int32_t quotient;
+ int32_t value = 1;
+ int32_t divisor = 0;
+ asm("cvtsi2ssl %[value], %%xmm0\r\n"
+ "cvtsi2ssl %[divisor], %%xmm1\r\n"
+ "divss %%xmm1, %%xmm0\r\n"
+ // Exception is masked, so it does not occur here.
+ "cvtss2sil %%xmm0, %[quotient]\r\n"
+
+ // SYS_getpid placed in rax by constraint.
+ "syscall\r\n"
+
+ // Intel SDM Vol 1, Ch 10.2.3.1 "SIMD Floating-Point Mask and Flag Bits":
+ // "If LDMXCSR or FXRSTOR clears a mask bit and sets the corresponding
+ // exception flag bit, a SIMD floating-point exception will not be
+ // generated as a result of this change. The unmasked exception will be
+ // generated only upon the execution of the next SSE/SSE2/SSE3 instruction
+ // that detects the unmasked exception condition."
+ //
+ // Though ambiguous, empirical evidence indicates that this means that
+ // exception flags set in the status register will never cause an
+ // exception to be raised; only a new exception condition will do so.
+ //
+ // Thus here we just check for the flag itself rather than trying to raise
+ // the exception.
+ "stmxcsr %[mxcsr]\r\n"
+ : [ quotient ] "=r"(quotient), [ mxcsr ] "+m"(mxcsr)
+ : [ value ] "r"(value), [ divisor ] "r"(divisor), "a"(SYS_getpid)
+ : "xmm0", "xmm1", "rcx", "r11");
+
+ EXPECT_TRUE(mxcsr & kMXCSRDiv0Flag);
+}
+
TEST(ExceptionTest, IOAccessFault) {
// See above.
struct sigaction sa = {};
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
index 581f03533..b5e0a512b 100644
--- a/test/syscalls/linux/exec.cc
+++ b/test/syscalls/linux/exec.cc
@@ -47,23 +47,14 @@ namespace testing {
namespace {
-constexpr char kBasicWorkload[] = "exec_basic_workload";
-constexpr char kExitScript[] = "exit_script";
-constexpr char kStateWorkload[] = "exec_state_workload";
-constexpr char kProcExeWorkload[] = "exec_proc_exe_workload";
-constexpr char kAssertClosedWorkload[] = "exec_assert_closed_workload";
-constexpr char kPriorityWorkload[] = "priority_execve";
-
-std::string WorkloadPath(absl::string_view binary) {
- std::string full_path;
- char* test_src = getenv("TEST_SRCDIR");
- if (test_src) {
- full_path = JoinPath(test_src, "__main__/test/syscalls/linux", binary);
- }
-
- TEST_CHECK(full_path.empty() == false);
- return full_path;
-}
+constexpr char kBasicWorkload[] = "test/syscalls/linux/exec_basic_workload";
+constexpr char kExitScript[] = "test/syscalls/linux/exit_script";
+constexpr char kStateWorkload[] = "test/syscalls/linux/exec_state_workload";
+constexpr char kProcExeWorkload[] =
+ "test/syscalls/linux/exec_proc_exe_workload";
+constexpr char kAssertClosedWorkload[] =
+ "test/syscalls/linux/exec_assert_closed_workload";
+constexpr char kPriorityWorkload[] = "test/syscalls/linux/priority_execve";
constexpr char kExit42[] = "--exec_exit_42";
constexpr char kExecWithThread[] = "--exec_exec_with_thread";
@@ -171,44 +162,44 @@ TEST(ExecTest, EmptyPath) {
}
TEST(ExecTest, Basic) {
- CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, {},
+ CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)}, {},
ArgEnvExitStatus(0, 0),
- absl::StrCat(WorkloadPath(kBasicWorkload), "\n"));
+ absl::StrCat(RunfilePath(kBasicWorkload), "\n"));
}
TEST(ExecTest, OneArg) {
- CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload), "1"},
- {}, ArgEnvExitStatus(1, 0),
- absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n"));
+ CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload), "1"}, {},
+ ArgEnvExitStatus(1, 0),
+ absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n"));
}
TEST(ExecTest, FiveArg) {
- CheckExec(WorkloadPath(kBasicWorkload),
- {WorkloadPath(kBasicWorkload), "1", "2", "3", "4", "5"}, {},
+ CheckExec(RunfilePath(kBasicWorkload),
+ {RunfilePath(kBasicWorkload), "1", "2", "3", "4", "5"}, {},
ArgEnvExitStatus(5, 0),
- absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
+ absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
}
TEST(ExecTest, OneEnv) {
- CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)}, {"1"},
+ CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)}, {"1"},
ArgEnvExitStatus(0, 1),
- absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n"));
+ absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n"));
}
TEST(ExecTest, FiveEnv) {
- CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload)},
+ CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)},
{"1", "2", "3", "4", "5"}, ArgEnvExitStatus(0, 5),
- absl::StrCat(WorkloadPath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
+ absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
}
TEST(ExecTest, OneArgOneEnv) {
- CheckExec(WorkloadPath(kBasicWorkload), {WorkloadPath(kBasicWorkload), "arg"},
+ CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload), "arg"},
{"env"}, ArgEnvExitStatus(1, 1),
- absl::StrCat(WorkloadPath(kBasicWorkload), "\narg\nenv\n"));
+ absl::StrCat(RunfilePath(kBasicWorkload), "\narg\nenv\n"));
}
TEST(ExecTest, InterpreterScript) {
- CheckExec(WorkloadPath(kExitScript), {WorkloadPath(kExitScript), "25"}, {},
+ CheckExec(RunfilePath(kExitScript), {RunfilePath(kExitScript), "25"}, {},
ArgEnvExitStatus(25, 0), "");
}
@@ -216,7 +207,7 @@ TEST(ExecTest, InterpreterScript) {
TEST(ExecTest, InterpreterScriptArgSplit) {
// Symlink through /tmp to ensure the path is short enough.
TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
- TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " foo bar"),
@@ -230,7 +221,7 @@ TEST(ExecTest, InterpreterScriptArgSplit) {
TEST(ExecTest, InterpreterScriptArgvZero) {
// Symlink through /tmp to ensure the path is short enough.
TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
- TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
@@ -244,7 +235,7 @@ TEST(ExecTest, InterpreterScriptArgvZero) {
TEST(ExecTest, InterpreterScriptArgvZeroRelative) {
// Symlink through /tmp to ensure the path is short enough.
TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
- TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
@@ -261,7 +252,7 @@ TEST(ExecTest, InterpreterScriptArgvZeroRelative) {
TEST(ExecTest, InterpreterScriptArgvZeroAdded) {
// Symlink through /tmp to ensure the path is short enough.
TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
- TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
@@ -274,7 +265,7 @@ TEST(ExecTest, InterpreterScriptArgvZeroAdded) {
TEST(ExecTest, InterpreterScriptArgNUL) {
// Symlink through /tmp to ensure the path is short enough.
TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
- TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
GetAbsoluteTestTmpdir(),
@@ -289,7 +280,7 @@ TEST(ExecTest, InterpreterScriptArgNUL) {
TEST(ExecTest, InterpreterScriptTrailingWhitespace) {
// Symlink through /tmp to ensure the path is short enough.
TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
- TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " "), 0755));
@@ -302,7 +293,7 @@ TEST(ExecTest, InterpreterScriptTrailingWhitespace) {
TEST(ExecTest, InterpreterScriptArgWhitespace) {
// Symlink through /tmp to ensure the path is short enough.
TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
- TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " foo"), 0755));
@@ -325,7 +316,7 @@ TEST(ExecTest, InterpreterScriptNoPath) {
TEST(ExecTest, ExecFn) {
// Symlink through /tmp to ensure the path is short enough.
TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
- TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kStateWorkload)));
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kStateWorkload)));
TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " PrintExecFn"),
@@ -342,7 +333,7 @@ TEST(ExecTest, ExecFn) {
}
TEST(ExecTest, ExecName) {
- std::string path = WorkloadPath(kStateWorkload);
+ std::string path = RunfilePath(kStateWorkload);
CheckExec(path, {path, "PrintExecName"}, {}, ArgEnvExitStatus(0, 0),
absl::StrCat(Basename(path).substr(0, 15), "\n"));
@@ -351,7 +342,7 @@ TEST(ExecTest, ExecName) {
TEST(ExecTest, ExecNameScript) {
// Symlink through /tmp to ensure the path is short enough.
TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
- TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kStateWorkload)));
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kStateWorkload)));
TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
GetAbsoluteTestTmpdir(),
@@ -405,13 +396,13 @@ TEST(ExecStateTest, HandlerReset) {
ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
ExecveArray args = {
- WorkloadPath(kStateWorkload),
+ RunfilePath(kStateWorkload),
"CheckSigHandler",
absl::StrCat(SIGUSR1),
absl::StrCat(absl::Hex(reinterpret_cast<uintptr_t>(SIG_DFL))),
};
- CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+ CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
}
// Ignored signal dispositions are not reset.
@@ -421,13 +412,13 @@ TEST(ExecStateTest, IgnorePreserved) {
ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
ExecveArray args = {
- WorkloadPath(kStateWorkload),
+ RunfilePath(kStateWorkload),
"CheckSigHandler",
absl::StrCat(SIGUSR1),
absl::StrCat(absl::Hex(reinterpret_cast<uintptr_t>(SIG_IGN))),
};
- CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+ CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
}
// Signal masks are not reset on exec
@@ -438,12 +429,12 @@ TEST(ExecStateTest, SignalMask) {
ASSERT_THAT(sigprocmask(SIG_BLOCK, &s, nullptr), SyscallSucceeds());
ExecveArray args = {
- WorkloadPath(kStateWorkload),
+ RunfilePath(kStateWorkload),
"CheckSigBlocked",
absl::StrCat(SIGUSR1),
};
- CheckExec(WorkloadPath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+ CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
}
// itimers persist across execve.
@@ -471,7 +462,7 @@ TEST(ExecStateTest, ItimerPreserved) {
}
};
- std::string filename = WorkloadPath(kStateWorkload);
+ std::string filename = RunfilePath(kStateWorkload);
ExecveArray argv = {
filename,
"CheckItimerEnabled",
@@ -495,8 +486,8 @@ TEST(ExecStateTest, ItimerPreserved) {
TEST(ProcSelfExe, ChangesAcrossExecve) {
// See exec_proc_exe_workload for more details. We simply
// assert that the /proc/self/exe link changes across execve.
- CheckExec(WorkloadPath(kProcExeWorkload),
- {WorkloadPath(kProcExeWorkload),
+ CheckExec(RunfilePath(kProcExeWorkload),
+ {RunfilePath(kProcExeWorkload),
ASSERT_NO_ERRNO_AND_VALUE(ProcessExePath(getpid()))},
{}, W_EXITCODE(0, 0), "");
}
@@ -507,8 +498,8 @@ TEST(ExecTest, CloexecNormalFile) {
const FileDescriptor fd_closed_on_exec =
ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY | O_CLOEXEC));
- CheckExec(WorkloadPath(kAssertClosedWorkload),
- {WorkloadPath(kAssertClosedWorkload),
+ CheckExec(RunfilePath(kAssertClosedWorkload),
+ {RunfilePath(kAssertClosedWorkload),
absl::StrCat(fd_closed_on_exec.get())},
{}, W_EXITCODE(0, 0), "");
@@ -517,10 +508,10 @@ TEST(ExecTest, CloexecNormalFile) {
const FileDescriptor fd_open_on_exec =
ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY));
- CheckExec(WorkloadPath(kAssertClosedWorkload),
- {WorkloadPath(kAssertClosedWorkload),
- absl::StrCat(fd_open_on_exec.get())},
- {}, W_EXITCODE(2, 0), "");
+ CheckExec(
+ RunfilePath(kAssertClosedWorkload),
+ {RunfilePath(kAssertClosedWorkload), absl::StrCat(fd_open_on_exec.get())},
+ {}, W_EXITCODE(2, 0), "");
}
TEST(ExecTest, CloexecEventfd) {
@@ -528,15 +519,15 @@ TEST(ExecTest, CloexecEventfd) {
ASSERT_THAT(efd = eventfd(0, EFD_CLOEXEC), SyscallSucceeds());
FileDescriptor fd(efd);
- CheckExec(WorkloadPath(kAssertClosedWorkload),
- {WorkloadPath(kAssertClosedWorkload), absl::StrCat(fd.get())}, {},
+ CheckExec(RunfilePath(kAssertClosedWorkload),
+ {RunfilePath(kAssertClosedWorkload), absl::StrCat(fd.get())}, {},
W_EXITCODE(0, 0), "");
}
constexpr int kLinuxMaxSymlinks = 40;
TEST(ExecTest, SymlinkLimitExceeded) {
- std::string path = WorkloadPath(kBasicWorkload);
+ std::string path = RunfilePath(kBasicWorkload);
// Hold onto TempPath objects so they are not destructed prematurely.
std::vector<TempPath> symlinks;
@@ -575,13 +566,13 @@ TEST(ExecTest, SymlinkLimitRefreshedForInterpreter) {
}
TEST(ExecveatTest, BasicWithFDCWD) {
- std::string path = WorkloadPath(kBasicWorkload);
+ std::string path = RunfilePath(kBasicWorkload);
CheckExecveat(AT_FDCWD, path, {path}, {}, /*flags=*/0, ArgEnvExitStatus(0, 0),
absl::StrCat(path, "\n"));
}
TEST(ExecveatTest, Basic) {
- std::string absolute_path = WorkloadPath(kBasicWorkload);
+ std::string absolute_path = RunfilePath(kBasicWorkload);
std::string parent_dir = std::string(Dirname(absolute_path));
std::string base = std::string(Basename(absolute_path));
const FileDescriptor dirfd =
@@ -592,7 +583,7 @@ TEST(ExecveatTest, Basic) {
}
TEST(ExecveatTest, FDNotADirectory) {
- std::string absolute_path = WorkloadPath(kBasicWorkload);
+ std::string absolute_path = RunfilePath(kBasicWorkload);
std::string base = std::string(Basename(absolute_path));
const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(absolute_path, 0));
@@ -604,13 +595,13 @@ TEST(ExecveatTest, FDNotADirectory) {
}
TEST(ExecveatTest, AbsolutePathWithFDCWD) {
- std::string path = WorkloadPath(kBasicWorkload);
+ std::string path = RunfilePath(kBasicWorkload);
CheckExecveat(AT_FDCWD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0,
absl::StrCat(path, "\n"));
}
TEST(ExecveatTest, AbsolutePath) {
- std::string path = WorkloadPath(kBasicWorkload);
+ std::string path = RunfilePath(kBasicWorkload);
// File descriptor should be ignored when an absolute path is given.
const int32_t badFD = -1;
CheckExecveat(badFD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0,
@@ -618,7 +609,7 @@ TEST(ExecveatTest, AbsolutePath) {
}
TEST(ExecveatTest, EmptyPathBasic) {
- std::string path = WorkloadPath(kBasicWorkload);
+ std::string path = RunfilePath(kBasicWorkload);
const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
CheckExecveat(fd.get(), "", {path}, {}, AT_EMPTY_PATH, ArgEnvExitStatus(0, 0),
@@ -626,7 +617,7 @@ TEST(ExecveatTest, EmptyPathBasic) {
}
TEST(ExecveatTest, EmptyPathWithDirFD) {
- std::string path = WorkloadPath(kBasicWorkload);
+ std::string path = RunfilePath(kBasicWorkload);
std::string parent_dir = std::string(Dirname(path));
const FileDescriptor dirfd =
ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
@@ -639,7 +630,7 @@ TEST(ExecveatTest, EmptyPathWithDirFD) {
}
TEST(ExecveatTest, EmptyPathWithoutEmptyPathFlag) {
- std::string path = WorkloadPath(kBasicWorkload);
+ std::string path = RunfilePath(kBasicWorkload);
const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
int execve_errno;
@@ -649,7 +640,7 @@ TEST(ExecveatTest, EmptyPathWithoutEmptyPathFlag) {
}
TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) {
- std::string path = WorkloadPath(kBasicWorkload);
+ std::string path = RunfilePath(kBasicWorkload);
const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
CheckExecveat(fd.get(), path, {path}, {}, AT_EMPTY_PATH,
@@ -657,7 +648,7 @@ TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) {
}
TEST(ExecveatTest, RelativePathWithEmptyPathFlag) {
- std::string absolute_path = WorkloadPath(kBasicWorkload);
+ std::string absolute_path = RunfilePath(kBasicWorkload);
std::string parent_dir = std::string(Dirname(absolute_path));
std::string base = std::string(Basename(absolute_path));
const FileDescriptor dirfd =
@@ -670,7 +661,7 @@ TEST(ExecveatTest, RelativePathWithEmptyPathFlag) {
TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) {
std::string parent_dir = "/tmp";
TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
- TempPath::CreateSymlinkTo(parent_dir, WorkloadPath(kBasicWorkload)));
+ TempPath::CreateSymlinkTo(parent_dir, RunfilePath(kBasicWorkload)));
const FileDescriptor dirfd =
ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
std::string base = std::string(Basename(link.path()));
@@ -685,7 +676,7 @@ TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) {
TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) {
std::string parent_dir = "/tmp";
TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
- TempPath::CreateSymlinkTo(parent_dir, WorkloadPath(kBasicWorkload)));
+ TempPath::CreateSymlinkTo(parent_dir, RunfilePath(kBasicWorkload)));
std::string path = link.path();
int execve_errno;
@@ -697,7 +688,7 @@ TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) {
TEST(ExecveatTest, SymlinkNoFollowAndEmptyPath) {
TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
- TempPath::CreateSymlinkTo("/tmp", WorkloadPath(kBasicWorkload)));
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
std::string path = link.path();
const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, 0));
@@ -723,7 +714,7 @@ TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) {
}
TEST(ExecveatTest, BasicWithCloexecFD) {
- std::string path = WorkloadPath(kBasicWorkload);
+ std::string path = RunfilePath(kBasicWorkload);
const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC));
CheckExecveat(fd.get(), "", {path}, {}, AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH,
@@ -731,7 +722,7 @@ TEST(ExecveatTest, BasicWithCloexecFD) {
}
TEST(ExecveatTest, InterpreterScriptWithCloexecFD) {
- std::string path = WorkloadPath(kExitScript);
+ std::string path = RunfilePath(kExitScript);
const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC));
int execve_errno;
@@ -742,7 +733,7 @@ TEST(ExecveatTest, InterpreterScriptWithCloexecFD) {
}
TEST(ExecveatTest, InterpreterScriptWithCloexecDirFD) {
- std::string absolute_path = WorkloadPath(kExitScript);
+ std::string absolute_path = RunfilePath(kExitScript);
std::string parent_dir = std::string(Dirname(absolute_path));
std::string base = std::string(Basename(absolute_path));
const FileDescriptor dirfd =
@@ -775,7 +766,7 @@ TEST(GetpriorityTest, ExecveMaintainsPriority) {
// Program run (priority_execve) will exit(X) where
// X=getpriority(PRIO_PROCESS,0). Check that this exit value is prio.
- CheckExec(WorkloadPath(kPriorityWorkload), {WorkloadPath(kPriorityWorkload)},
+ CheckExec(RunfilePath(kPriorityWorkload), {RunfilePath(kPriorityWorkload)},
{}, W_EXITCODE(expected_exit_code, 0), "");
}
diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
index 8a45be12a..4f3aa81d6 100644
--- a/test/syscalls/linux/fcntl.cc
+++ b/test/syscalls/linux/fcntl.cc
@@ -14,6 +14,7 @@
#include <fcntl.h>
#include <signal.h>
+#include <sys/types.h>
#include <syscall.h>
#include <unistd.h>
@@ -32,6 +33,7 @@
#include "test/util/eventfd_util.h"
#include "test/util/multiprocess_util.h"
#include "test/util/posix_error.h"
+#include "test/util/save_util.h"
#include "test/util/temp_path.h"
#include "test/util/test_util.h"
#include "test/util/timer_util.h"
@@ -910,8 +912,166 @@ TEST(FcntlTest, GetOwn) {
FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
- ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN),
+ EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), 0);
+ MaybeSave();
+}
+
+TEST(FcntlTest, GetOwnEx) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &owner),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST(FcntlTest, SetOwnExInvalidType) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = __pid_type(-1);
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(FcntlTest, SetOwnExInvalidTid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = F_OWNER_TID;
+ owner.pid = -1;
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExInvalidPid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = F_OWNER_PID;
+ owner.pid = -1;
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExInvalidPgrp) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = F_OWNER_PGRP;
+ owner.pid = -1;
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExTid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = F_OWNER_TID;
+ EXPECT_THAT(owner.pid = syscall(__NR_gettid), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallSucceeds());
+
+ EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), owner.pid);
+ MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnExPid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = F_OWNER_PID;
+ EXPECT_THAT(owner.pid = getpid(), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallSucceeds());
+
+ EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), owner.pid);
+ MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnExPgrp) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = F_OWNER_PGRP;
+ EXPECT_THAT(owner.pid = getpgrp(), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallSucceeds());
+
+ // NOTE(igudger): I don't understand why, but this is flaky on Linux.
+ // GetOwnExPgrp (below) does not have this issue.
+ SKIP_IF(!IsRunningOnGvisor());
+
+ EXPECT_EQ(syscall(__NR_fcntl, s.get(), F_GETOWN), -owner.pid);
+ MaybeSave();
+}
+
+TEST(FcntlTest, GetOwnExTid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex set_owner = {};
+ set_owner.type = F_OWNER_TID;
+ EXPECT_THAT(set_owner.pid = syscall(__NR_gettid), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+ SyscallSucceeds());
+
+ f_owner_ex got_owner = {};
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(got_owner.type, set_owner.type);
+ EXPECT_EQ(got_owner.pid, set_owner.pid);
+}
+
+TEST(FcntlTest, GetOwnExPid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex set_owner = {};
+ set_owner.type = F_OWNER_PID;
+ EXPECT_THAT(set_owner.pid = getpid(), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+ SyscallSucceeds());
+
+ f_owner_ex got_owner = {};
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(got_owner.type, set_owner.type);
+ EXPECT_EQ(got_owner.pid, set_owner.pid);
+}
+
+TEST(FcntlTest, GetOwnExPgrp) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex set_owner = {};
+ set_owner.type = F_OWNER_PGRP;
+ EXPECT_THAT(set_owner.pid = getpgrp(), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+ SyscallSucceeds());
+
+ f_owner_ex got_owner = {};
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
SyscallSucceedsWithValue(0));
+ EXPECT_EQ(got_owner.type, set_owner.type);
+ EXPECT_EQ(got_owner.pid, set_owner.pid);
}
} // namespace
diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
index 4e048320e..6f80bc97c 100644
--- a/test/syscalls/linux/file_base.h
+++ b/test/syscalls/linux/file_base.h
@@ -111,95 +111,6 @@ class FileTest : public ::testing::Test {
int test_pipe_[2];
};
-class SocketTest : public ::testing::Test {
- public:
- void SetUp() override {
- test_unix_stream_socket_[0] = -1;
- test_unix_stream_socket_[1] = -1;
- test_unix_dgram_socket_[0] = -1;
- test_unix_dgram_socket_[1] = -1;
- test_unix_seqpacket_socket_[0] = -1;
- test_unix_seqpacket_socket_[1] = -1;
- test_tcp_socket_[0] = -1;
- test_tcp_socket_[1] = -1;
-
- ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, test_unix_stream_socket_),
- SyscallSucceeds());
- ASSERT_THAT(fcntl(test_unix_stream_socket_[0], F_SETFL, O_NONBLOCK),
- SyscallSucceeds());
- ASSERT_THAT(socketpair(AF_UNIX, SOCK_DGRAM, 0, test_unix_dgram_socket_),
- SyscallSucceeds());
- ASSERT_THAT(fcntl(test_unix_dgram_socket_[0], F_SETFL, O_NONBLOCK),
- SyscallSucceeds());
- ASSERT_THAT(
- socketpair(AF_UNIX, SOCK_SEQPACKET, 0, test_unix_seqpacket_socket_),
- SyscallSucceeds());
- ASSERT_THAT(fcntl(test_unix_seqpacket_socket_[0], F_SETFL, O_NONBLOCK),
- SyscallSucceeds());
- }
-
- void TearDown() override {
- close(test_unix_stream_socket_[0]);
- close(test_unix_stream_socket_[1]);
-
- close(test_unix_dgram_socket_[0]);
- close(test_unix_dgram_socket_[1]);
-
- close(test_unix_seqpacket_socket_[0]);
- close(test_unix_seqpacket_socket_[1]);
-
- close(test_tcp_socket_[0]);
- close(test_tcp_socket_[1]);
- }
-
- int test_unix_stream_socket_[2];
- int test_unix_dgram_socket_[2];
- int test_unix_seqpacket_socket_[2];
- int test_tcp_socket_[2];
-};
-
-// MatchesStringLength checks that a tuple argument of (struct iovec *, int)
-// corresponding to an iovec array and its length, contains data that matches
-// the string length strlen.
-MATCHER_P(MatchesStringLength, strlen, "") {
- struct iovec* iovs = arg.first;
- int niov = arg.second;
- int offset = 0;
- for (int i = 0; i < niov; i++) {
- offset += iovs[i].iov_len;
- }
- if (offset != static_cast<int>(strlen)) {
- *result_listener << offset;
- return false;
- }
- return true;
-}
-
-// MatchesStringValue checks that a tuple argument of (struct iovec *, int)
-// corresponding to an iovec array and its length, contains data that matches
-// the string value str.
-MATCHER_P(MatchesStringValue, str, "") {
- struct iovec* iovs = arg.first;
- int len = strlen(str);
- int niov = arg.second;
- int offset = 0;
- for (int i = 0; i < niov; i++) {
- struct iovec iov = iovs[i];
- if (len < offset) {
- *result_listener << "strlen " << len << " < offset " << offset;
- return false;
- }
- if (strncmp(static_cast<char*>(iov.iov_base), &str[offset], iov.iov_len)) {
- absl::string_view iovec_string(static_cast<char*>(iov.iov_base),
- iov.iov_len);
- *result_listener << iovec_string << " @offset " << offset;
- return false;
- }
- offset += iov.iov_len;
- }
- return true;
-}
-
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc
index d3e3f998c..40c80a6e1 100644
--- a/test/syscalls/linux/futex.cc
+++ b/test/syscalls/linux/futex.cc
@@ -239,6 +239,27 @@ TEST_P(PrivateAndSharedFutexTest, Wake1_NoRandomSave) {
EXPECT_THAT(futex_wake(IsPrivate(), &a, 1), SyscallSucceedsWithValue(1));
}
+TEST_P(PrivateAndSharedFutexTest, Wake0_NoRandomSave) {
+ constexpr int kInitialValue = 1;
+ std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+
+ // Prevent save/restore from interrupting futex_wait, which will cause it to
+ // return EAGAIN instead of the expected result if futex_wait is restarted
+ // after we change the value of a below.
+ DisableSave ds;
+ ScopedThread thread([&] {
+ EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue),
+ SyscallSucceedsWithValue(0));
+ });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ // Change a so that if futex_wake happens before futex_wait, the latter
+ // returns EAGAIN instead of hanging the test.
+ a.fetch_add(1);
+ // The Linux kernel wakes one waiter even if val is 0 or negative.
+ EXPECT_THAT(futex_wake(IsPrivate(), &a, 0), SyscallSucceedsWithValue(1));
+}
+
TEST_P(PrivateAndSharedFutexTest, WakeAll_NoRandomSave) {
constexpr int kInitialValue = 1;
std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
index 7384c27dc..fdef646eb 100644
--- a/test/syscalls/linux/inotify.cc
+++ b/test/syscalls/linux/inotify.cc
@@ -977,7 +977,7 @@ TEST(Inotify, WatchOnRelativePath) {
ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
// Change working directory to root.
- const char* old_working_dir = get_current_dir_name();
+ const FileDescriptor cwd = ASSERT_NO_ERRNO_AND_VALUE(Open(".", O_PATH));
EXPECT_THAT(chdir(root.path().c_str()), SyscallSucceeds());
// Add a watch on file1 with a relative path.
@@ -997,7 +997,7 @@ TEST(Inotify, WatchOnRelativePath) {
// continue to hold a reference, random save/restore tests can fail if a save
// is triggered after "root" is unlinked; we can't save deleted fs objects
// with active references.
- EXPECT_THAT(chdir(old_working_dir), SyscallSucceeds());
+ EXPECT_THAT(fchdir(cwd.get()), SyscallSucceeds());
}
TEST(Inotify, ZeroLengthReadWriteDoesNotGenerateEvent) {
@@ -1591,6 +1591,34 @@ TEST(Inotify, EpollNoDeadlock) {
}
}
+TEST(Inotify, SpliceEvent) {
+ int pipes[2];
+ ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds());
+
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ root.path(), "some content", TempPath::kDefaultFileMode));
+
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+ const int watcher = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ char buf;
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+
+ EXPECT_THAT(splice(fd.get(), nullptr, pipes[1], nullptr,
+ sizeof(struct inotify_event) + 1, SPLICE_F_NONBLOCK),
+ SyscallSucceedsWithValue(sizeof(struct inotify_event)));
+
+ const FileDescriptor read_fd(pipes[0]);
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(read_fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_ACCESS, watcher)}));
+}
+
} // namespace
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc
index c4f8bff08..b0a07a064 100644
--- a/test/syscalls/linux/ioctl.cc
+++ b/test/syscalls/linux/ioctl.cc
@@ -215,7 +215,8 @@ TEST_F(IoctlTest, FIOASYNCSelfTarget2) {
auto mask_cleanup =
ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO));
- pid_t pid = getpid();
+ pid_t pid = -1;
+ EXPECT_THAT(pid = getpid(), SyscallSucceeds());
EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds());
int set = 1;
diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
index 8398fc95f..6b472eb2f 100644
--- a/test/syscalls/linux/ip_socket_test_util.cc
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -187,24 +187,24 @@ PosixErrorOr<int> IfAddrHelper::GetIndex(std::string name) {
return InterfaceIndex(name);
}
-std::string GetAddr4Str(in_addr* a) {
+std::string GetAddr4Str(const in_addr* a) {
char str[INET_ADDRSTRLEN];
inet_ntop(AF_INET, a, str, sizeof(str));
return std::string(str);
}
-std::string GetAddr6Str(in6_addr* a) {
+std::string GetAddr6Str(const in6_addr* a) {
char str[INET6_ADDRSTRLEN];
inet_ntop(AF_INET6, a, str, sizeof(str));
return std::string(str);
}
-std::string GetAddrStr(sockaddr* a) {
+std::string GetAddrStr(const sockaddr* a) {
if (a->sa_family == AF_INET) {
- auto src = &(reinterpret_cast<sockaddr_in*>(a)->sin_addr);
+ auto src = &(reinterpret_cast<const sockaddr_in*>(a)->sin_addr);
return GetAddr4Str(src);
} else if (a->sa_family == AF_INET6) {
- auto src = &(reinterpret_cast<sockaddr_in6*>(a)->sin6_addr);
+ auto src = &(reinterpret_cast<const sockaddr_in6*>(a)->sin6_addr);
return GetAddr6Str(src);
}
return std::string("<invalid>");
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
index 072230d85..0f58e0f77 100644
--- a/test/syscalls/linux/ip_socket_test_util.h
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -26,25 +26,6 @@
namespace gvisor {
namespace testing {
-// Possible values of the "st" field in a /proc/net/{tcp,udp} entry. Source:
-// Linux kernel, include/net/tcp_states.h.
-enum {
- TCP_ESTABLISHED = 1,
- TCP_SYN_SENT,
- TCP_SYN_RECV,
- TCP_FIN_WAIT1,
- TCP_FIN_WAIT2,
- TCP_TIME_WAIT,
- TCP_CLOSE,
- TCP_CLOSE_WAIT,
- TCP_LAST_ACK,
- TCP_LISTEN,
- TCP_CLOSING,
- TCP_NEW_SYN_RECV,
-
- TCP_MAX_STATES
-};
-
// Extracts the IP address from an inet sockaddr in network byte order.
uint32_t IPFromInetSockaddr(const struct sockaddr* addr);
@@ -124,14 +105,14 @@ class IfAddrHelper {
};
// GetAddr4Str returns the given IPv4 network address structure as a string.
-std::string GetAddr4Str(in_addr* a);
+std::string GetAddr4Str(const in_addr* a);
// GetAddr6Str returns the given IPv6 network address structure as a string.
-std::string GetAddr6Str(in6_addr* a);
+std::string GetAddr6Str(const in6_addr* a);
// GetAddrStr returns the given IPv4 or IPv6 network address structure as a
// string.
-std::string GetAddrStr(sockaddr* a);
+std::string GetAddrStr(const sockaddr* a);
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
index 6f2639d8a..1c4d9f1c7 100644
--- a/test/syscalls/linux/mmap.cc
+++ b/test/syscalls/linux/mmap.cc
@@ -814,23 +814,27 @@ class MMapFileTest : public MMapTest {
}
};
+class MMapFileParamTest
+ : public MMapFileTest,
+ public ::testing::WithParamInterface<std::tuple<int, int>> {
+ protected:
+ int prot() const { return std::get<0>(GetParam()); }
+
+ int flags() const { return std::get<1>(GetParam()); }
+};
+
// MAP_POPULATE allowed.
// There isn't a good way to verify it actually did anything.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, MapPopulate) {
- ASSERT_THAT(
- Map(0, kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd_.get(), 0),
- SyscallSucceeds());
+TEST_P(MMapFileParamTest, MapPopulate) {
+ ASSERT_THAT(Map(0, kPageSize, prot(), flags() | MAP_POPULATE, fd_.get(), 0),
+ SyscallSucceeds());
}
// MAP_POPULATE on a short file.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, MapPopulateShort) {
- ASSERT_THAT(Map(0, 2 * kPageSize, PROT_READ, MAP_PRIVATE | MAP_POPULATE,
- fd_.get(), 0),
- SyscallSucceeds());
+TEST_P(MMapFileParamTest, MapPopulateShort) {
+ ASSERT_THAT(
+ Map(0, 2 * kPageSize, prot(), flags() | MAP_POPULATE, fd_.get(), 0),
+ SyscallSucceeds());
}
// Read contents from mapped file.
@@ -901,16 +905,6 @@ TEST_F(MMapFileTest, WritePrivateOnReadOnlyFd) {
reinterpret_cast<volatile char*>(addr));
}
-// MAP_PRIVATE PROT_READ is not allowed on write-only FDs.
-TEST_F(MMapFileTest, ReadPrivateOnWriteOnlyFd) {
- const FileDescriptor fd =
- ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY));
-
- uintptr_t addr;
- EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fd.get(), 0),
- SyscallFailsWithErrno(EACCES));
-}
-
// MAP_SHARED PROT_WRITE not allowed on read-only FDs.
TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) {
const FileDescriptor fd =
@@ -922,28 +916,13 @@ TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) {
SyscallFailsWithErrno(EACCES));
}
-// MAP_SHARED PROT_READ not allowed on write-only FDs.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, ReadSharedOnWriteOnlyFd) {
- const FileDescriptor fd =
- ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY));
-
- uintptr_t addr;
- EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd.get(), 0),
- SyscallFailsWithErrno(EACCES));
-}
-
-// MAP_SHARED PROT_WRITE not allowed on write-only FDs.
-// The FD must always be readable.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, WriteSharedOnWriteOnlyFd) {
+// The FD must be readable.
+TEST_P(MMapFileParamTest, WriteOnlyFd) {
const FileDescriptor fd =
ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY));
uintptr_t addr;
- EXPECT_THAT(addr = Map(0, kPageSize, PROT_WRITE, MAP_SHARED, fd.get(), 0),
+ EXPECT_THAT(addr = Map(0, kPageSize, prot(), flags(), fd.get(), 0),
SyscallFailsWithErrno(EACCES));
}
@@ -1182,7 +1161,7 @@ TEST_F(MMapFileTest, ReadSharedTruncateDownThenUp) {
ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
SyscallSucceeds());
- // Check that the memory contains he file data.
+ // Check that the memory contains the file data.
EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), buf.c_str(), kPageSize));
// Truncate down, then up.
@@ -1371,125 +1350,68 @@ TEST_F(MMapFileTest, WritePrivate) {
EqualsMemory(std::string(len, '\0')));
}
-// SIGBUS raised when writing past end of file to a private mapping.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, SigBusDeathWritePrivate) {
+// SIGBUS raised when reading or writing past end of a mapped file.
+TEST_P(MMapFileParamTest, SigBusDeath) {
SetupGvisorDeathTest();
uintptr_t addr;
- ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
- fd_.get(), 0),
+ ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0),
SyscallSucceeds());
- // MMapFileTest makes a file kPageSize/2 long. The entire first page will be
- // accessible. Write just beyond that.
- size_t len = strlen(kFileContents);
- EXPECT_EXIT(std::copy(kFileContents, kFileContents + len,
- reinterpret_cast<volatile char*>(addr + kPageSize)),
- ::testing::KilledBySignal(SIGBUS), "");
-}
-
-// SIGBUS raised when reading past end of file on a shared mapping.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, SigBusDeathReadShared) {
- SetupGvisorDeathTest();
-
- uintptr_t addr;
- ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
- SyscallSucceeds());
-
- // MMapFileTest makes a file kPageSize/2 long. The entire first page will be
- // accessible. Read just beyond that.
- std::vector<char> in(kPageSize);
- EXPECT_EXIT(
- std::copy(reinterpret_cast<volatile char*>(addr + kPageSize),
- reinterpret_cast<volatile char*>(addr + kPageSize) + kPageSize,
- in.data()),
- ::testing::KilledBySignal(SIGBUS), "");
+ auto* start = reinterpret_cast<volatile char*>(addr + kPageSize);
+
+ // MMapFileTest makes a file kPageSize/2 long. The entire first page should be
+ // accessible, but anything beyond it should not.
+ if (prot() & PROT_WRITE) {
+ // Write beyond first page.
+ size_t len = strlen(kFileContents);
+ EXPECT_EXIT(std::copy(kFileContents, kFileContents + len, start),
+ ::testing::KilledBySignal(SIGBUS), "");
+ } else {
+ // Read beyond first page.
+ std::vector<char> in(kPageSize);
+ EXPECT_EXIT(std::copy(start, start + kPageSize, in.data()),
+ ::testing::KilledBySignal(SIGBUS), "");
+ }
}
-// SIGBUS raised when reading past end of file on a shared mapping.
+// Tests that SIGBUS is not raised when reading from or writing to a
+// file-mapped page before EOF, even if part of the mapping extends beyond EOF.
//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, SigBusDeathWriteShared) {
- SetupGvisorDeathTest();
-
+// See b/27877699.
+TEST_P(MMapFileParamTest, NoSigBusOnPagesBeforeEOF) {
uintptr_t addr;
- ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
- fd_.get(), 0),
- SyscallSucceeds());
-
- // MMapFileTest makes a file kPageSize/2 long. The entire first page will be
- // accessible. Write just beyond that.
- size_t len = strlen(kFileContents);
- EXPECT_EXIT(std::copy(kFileContents, kFileContents + len,
- reinterpret_cast<volatile char*>(addr + kPageSize)),
- ::testing::KilledBySignal(SIGBUS), "");
-}
-
-// Tests that SIGBUS is not raised when writing to a file-mapped page before
-// EOF, even if part of the mapping extends beyond EOF.
-TEST_F(MMapFileTest, NoSigBusOnPagesBeforeEOF) {
- uintptr_t addr;
- ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
- fd_.get(), 0),
+ ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0),
SyscallSucceeds());
// The test passes if this survives.
- size_t len = strlen(kFileContents);
- std::copy(kFileContents, kFileContents + len,
- reinterpret_cast<volatile char*>(addr));
-}
-
-// Tests that SIGBUS is not raised when writing to a file-mapped page containing
-// EOF, *after* the EOF for a private mapping.
-TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWritePrivate) {
- uintptr_t addr;
- ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
- fd_.get(), 0),
- SyscallSucceeds());
-
- // The test passes if this survives. (Technically addr+kPageSize/2 is already
- // beyond EOF, but +1 to check for fencepost errors.)
- size_t len = strlen(kFileContents);
- std::copy(kFileContents, kFileContents + len,
- reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1));
-}
-
-// Tests that SIGBUS is not raised when reading from a file-mapped page
-// containing EOF, *after* the EOF for a shared mapping.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFReadShared) {
- uintptr_t addr;
- ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
- SyscallSucceeds());
-
- // The test passes if this survives. (Technically addr+kPageSize/2 is already
- // beyond EOF, but +1 to check for fencepost errors.)
auto* start = reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1);
size_t len = strlen(kFileContents);
- std::vector<char> in(len);
- std::copy(start, start + len, in.data());
+ if (prot() & PROT_WRITE) {
+ std::copy(kFileContents, kFileContents + len, start);
+ } else {
+ std::vector<char> in(len);
+ std::copy(start, start + len, in.data());
+ }
}
-// Tests that SIGBUS is not raised when writing to a file-mapped page containing
-// EOF, *after* the EOF for a shared mapping.
-//
-// FIXME(b/37222275): Parameterize.
-TEST_F(MMapFileTest, NoSigBusOnPageContainingEOFWriteShared) {
+// Tests that SIGBUS is not raised when reading from or writing to a
+// file-mapped page containing EOF, *after* the EOF.
+TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) {
uintptr_t addr;
- ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
- fd_.get(), 0),
+ ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0),
SyscallSucceeds());
// The test passes if this survives. (Technically addr+kPageSize/2 is already
// beyond EOF, but +1 to check for fencepost errors.)
+ auto* start = reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1);
size_t len = strlen(kFileContents);
- std::copy(kFileContents, kFileContents + len,
- reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1));
+ if (prot() & PROT_WRITE) {
+ std::copy(kFileContents, kFileContents + len, start);
+ } else {
+ std::vector<char> in(len);
+ std::copy(start, start + len, in.data());
+ }
}
// Tests that reading from writable shared file-mapped pages succeeds.
@@ -1733,6 +1655,15 @@ TEST(MMapNoFixtureTest, Map32Bit) {
#endif // defined(__x86_64__)
+INSTANTIATE_TEST_SUITE_P(
+ ReadWriteSharedPrivate, MMapFileParamTest,
+ ::testing::Combine(::testing::ValuesIn({
+ PROT_READ,
+ PROT_WRITE,
+ PROT_READ | PROT_WRITE,
+ }),
+ ::testing::ValuesIn({MAP_SHARED, MAP_PRIVATE})));
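+// The Combine() above yields six parameter sets: each of PROT_READ,
+// PROT_WRITE, and PROT_READ | PROT_WRITE is paired with both MAP_SHARED and
+// MAP_PRIVATE, so every MMapFileParamTest case runs under all six
+// combinations.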
+
} // namespace
} // namespace testing
diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc
index 33822ee57..df7129acc 100644
--- a/test/syscalls/linux/partial_bad_buffer.cc
+++ b/test/syscalls/linux/partial_bad_buffer.cc
@@ -18,7 +18,9 @@
#include <netinet/tcp.h>
#include <sys/mman.h>
#include <sys/socket.h>
+#include <sys/stat.h>
#include <sys/syscall.h>
+#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>
@@ -62,9 +64,9 @@ class PartialBadBufferTest : public ::testing::Test {
// Write some initial data.
size_t size = sizeof(kMessage) - 1;
EXPECT_THAT(WriteFd(fd_, &kMessage, size), SyscallSucceedsWithValue(size));
-
ASSERT_THAT(lseek(fd_, 0, SEEK_SET), SyscallSucceeds());
+ // Map a usable buffer.
addr_ = mmap(0, 2 * kPageSize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
ASSERT_NE(addr_, MAP_FAILED);
@@ -79,6 +81,15 @@ class PartialBadBufferTest : public ::testing::Test {
bad_buffer_ = buf + kPageSize - 1;
}
+ off_t Size() {
+ struct stat st;
+ int rc = fstat(fd_, &st);
+ if (rc < 0) {
+ return static_cast<off_t>(rc);
+ }
+ return st.st_size;
+ }
+
void TearDown() override {
EXPECT_THAT(munmap(addr_, 2 * kPageSize), SyscallSucceeds()) << addr_;
EXPECT_THAT(close(fd_), SyscallSucceeds());
@@ -165,97 +176,99 @@ TEST_F(PartialBadBufferTest, PreadvSmall) {
}
TEST_F(PartialBadBufferTest, WriteBig) {
- // FIXME(b/24788078): The sentry write syscalls will return immediately
- // if Access returns an error, but Access may not return an error
- // and the sentry will instead perform a partial write.
- SKIP_IF(IsRunningOnGvisor());
+ off_t orig_size = Size();
+ int n;
- EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, kPageSize),
- SyscallFailsWithErrno(EFAULT));
+ ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
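+ // Only the first byte of bad_buffer_ is accessible, so the write below may
+ // either fail outright with EFAULT or perform a partial write of exactly
+ // one byte; both outcomes are accepted, and the size check afterwards
+ // verifies whichever happened.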
+ EXPECT_THAT(
+ (n = RetryEINTR(write)(fd_, bad_buffer_, kPageSize)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
}
TEST_F(PartialBadBufferTest, WriteSmall) {
- // FIXME(b/24788078): The sentry write syscalls will return immediately
- // if Access returns an error, but Access may not return an error
- // and the sentry will instead perform a partial write.
- SKIP_IF(IsRunningOnGvisor());
+ off_t orig_size = Size();
+ int n;
- EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, 10),
- SyscallFailsWithErrno(EFAULT));
+ ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(
+ (n = RetryEINTR(write)(fd_, bad_buffer_, 10)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
}
TEST_F(PartialBadBufferTest, PwriteBig) {
- // FIXME(b/24788078): The sentry write syscalls will return immediately
- // if Access returns an error, but Access may not return an error
- // and the sentry will instead perform a partial write.
- SKIP_IF(IsRunningOnGvisor());
+ off_t orig_size = Size();
+ int n;
- EXPECT_THAT(RetryEINTR(pwrite)(fd_, bad_buffer_, kPageSize, 0),
- SyscallFailsWithErrno(EFAULT));
+ EXPECT_THAT(
+ (n = RetryEINTR(pwrite)(fd_, bad_buffer_, kPageSize, orig_size)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
}
TEST_F(PartialBadBufferTest, PwriteSmall) {
- // FIXME(b/24788078): The sentry write syscalls will return immediately
- // if Access returns an error, but Access may not return an error
- // and the sentry will instead perform a partial write.
- SKIP_IF(IsRunningOnGvisor());
+ off_t orig_size = Size();
+ int n;
- EXPECT_THAT(RetryEINTR(pwrite)(fd_, bad_buffer_, 10, 0),
- SyscallFailsWithErrno(EFAULT));
+ EXPECT_THAT(
+ (n = RetryEINTR(pwrite)(fd_, bad_buffer_, 10, orig_size)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
}
TEST_F(PartialBadBufferTest, WritevBig) {
- // FIXME(b/24788078): The sentry write syscalls will return immediately
- // if Access returns an error, but Access may not return an error
- // and the sentry will instead perform a partial write.
- SKIP_IF(IsRunningOnGvisor());
-
struct iovec vec;
vec.iov_base = bad_buffer_;
vec.iov_len = kPageSize;
+ off_t orig_size = Size();
+ int n;
- EXPECT_THAT(RetryEINTR(writev)(fd_, &vec, 1), SyscallFailsWithErrno(EFAULT));
+ ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(
+ (n = RetryEINTR(writev)(fd_, &vec, 1)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
}
TEST_F(PartialBadBufferTest, WritevSmall) {
- // FIXME(b/24788078): The sentry write syscalls will return immediately
- // if Access returns an error, but Access may not return an error
- // and the sentry will instead perform a partial write.
- SKIP_IF(IsRunningOnGvisor());
-
struct iovec vec;
vec.iov_base = bad_buffer_;
vec.iov_len = 10;
+ off_t orig_size = Size();
+ int n;
- EXPECT_THAT(RetryEINTR(writev)(fd_, &vec, 1), SyscallFailsWithErrno(EFAULT));
+ ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(
+ (n = RetryEINTR(writev)(fd_, &vec, 1)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
}
TEST_F(PartialBadBufferTest, PwritevBig) {
- // FIXME(b/24788078): The sentry write syscalls will return immediately
- // if Access returns an error, but Access may not return an error
- // and the sentry will instead perform a partial write.
- SKIP_IF(IsRunningOnGvisor());
-
struct iovec vec;
vec.iov_base = bad_buffer_;
vec.iov_len = kPageSize;
+ off_t orig_size = Size();
+ int n;
- EXPECT_THAT(RetryEINTR(pwritev)(fd_, &vec, 1, 0),
- SyscallFailsWithErrno(EFAULT));
+ EXPECT_THAT(
+ (n = RetryEINTR(pwritev)(fd_, &vec, 1, orig_size)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
}
TEST_F(PartialBadBufferTest, PwritevSmall) {
- // FIXME(b/24788078): The sentry write syscalls will return immediately
- // if Access returns an error, but Access may not return an error
- // and the sentry will instead perform a partial write.
- SKIP_IF(IsRunningOnGvisor());
-
struct iovec vec;
vec.iov_base = bad_buffer_;
vec.iov_len = 10;
+ off_t orig_size = Size();
+ int n;
- EXPECT_THAT(RetryEINTR(pwritev)(fd_, &vec, 1, 0),
- SyscallFailsWithErrno(EFAULT));
+ EXPECT_THAT(
+ (n = RetryEINTR(pwritev)(fd_, &vec, 1, orig_size)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
}
// getdents returns EFAULT when you claim the buffer is large enough, but
@@ -283,29 +296,6 @@ TEST_F(PartialBadBufferTest, GetdentsOneEntry) {
SyscallSucceedsWithValue(Gt(0)));
}
-// Verify that when write returns EFAULT the kernel hasn't silently written
-// the initial valid bytes.
-TEST_F(PartialBadBufferTest, WriteEfaultIsntPartial) {
- // FIXME(b/24788078): The sentry write syscalls will return immediately
- // if Access returns an error, but Access may not return an error
- // and the sentry will instead perform a partial write.
- SKIP_IF(IsRunningOnGvisor());
-
- bad_buffer_[0] = 'A';
- EXPECT_THAT(RetryEINTR(write)(fd_, bad_buffer_, 10),
- SyscallFailsWithErrno(EFAULT));
-
- size_t size = 255;
- char buf[255];
- memset(buf, 0, size);
-
- EXPECT_THAT(RetryEINTR(pread)(fd_, buf, size, 0),
- SyscallSucceedsWithValue(sizeof(kMessage) - 1));
-
- // 'A' has not been written.
- EXPECT_STREQ(buf, kMessage);
-}
-
PosixErrorOr<sockaddr_storage> InetLoopbackAddr(int family) {
struct sockaddr_storage addr;
memset(&addr, 0, sizeof(addr));
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index c0b354e65..ac9b21b24 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -212,6 +212,20 @@ TEST(Pipe2Test, BadOptions) {
EXPECT_THAT(pipe2(fds, 0xDEAD), SyscallFailsWithErrno(EINVAL));
}
+// Tests that opening named pipes with O_TRUNC shouldn't cause an error, but
+// calls to (f)truncate should.
+TEST(NamedPipeTest, Truncate) {
+ const std::string tmp_path = NewTempAbsPath();
+ SKIP_IF(mkfifo(tmp_path.c_str(), 0644) != 0);
+
+ ASSERT_THAT(open(tmp_path.c_str(), O_NONBLOCK | O_RDONLY), SyscallSucceeds());
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(tmp_path.c_str(), O_RDWR | O_NONBLOCK | O_TRUNC));
+
+ ASSERT_THAT(truncate(tmp_path.c_str(), 0), SyscallFailsWithErrno(EINVAL));
+ ASSERT_THAT(ftruncate(fd.get(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
TEST_P(PipeTest, Seek) {
SKIP_IF(!CreateBlocking());
diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc
index 9e5aa7fd0..c42472474 100644
--- a/test/syscalls/linux/poll.cc
+++ b/test/syscalls/linux/poll.cc
@@ -275,7 +275,8 @@ TEST_F(PollTest, Nfds) {
// Each entry in the 'fds' array refers to the eventfd and polls for
// "writable" events (events=POLLOUT). This essentially guarantees that the
// poll() is a no-op and allows negative testing of the 'nfds' parameter.
- std::vector<struct pollfd> fds(max_fds, {.fd = efd.get(), .events = POLLOUT});
+ std::vector<struct pollfd> fds(max_fds + 1,
+ {.fd = efd.get(), .events = POLLOUT});
// Verify that 'nfds' up to RLIMIT_NOFILE are allowed.
EXPECT_THAT(RetryEINTR(poll)(fds.data(), 1, 1), SyscallSucceedsWithValue(1));
diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc
index c9246367d..cd936ea90 100644
--- a/test/syscalls/linux/preadv2.cc
+++ b/test/syscalls/linux/preadv2.cc
@@ -202,7 +202,7 @@ TEST(Preadv2Test, TestInvalidOffset) {
iov[0].iov_len = 0;
EXPECT_THAT(preadv2(fd.get(), iov.get(), /*iovcnt=*/1, /*offset=*/-8,
- /*flags=*/RWF_HIPRI),
+ /*flags=*/0),
SyscallFailsWithErrno(EINVAL));
}
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index 512de5ee0..8cf08991b 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -37,6 +37,7 @@
#include <map>
#include <memory>
#include <ostream>
+#include <regex>
#include <string>
#include <unordered_set>
#include <utility>
@@ -51,6 +52,7 @@
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
+#include "absl/synchronization/notification.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
#include "test/util/capability_util.h"
@@ -1988,6 +1990,44 @@ TEST(Proc, GetdentsEnoent) {
SyscallFailsWithErrno(ENOENT));
}
+void CheckSyscwFromIOFile(const std::string& path, const std::string& regex) {
+ std::string output;
+ ASSERT_NO_ERRNO(GetContents(path, &output));
+ ASSERT_THAT(output, ContainsRegex(absl::StrCat("syscw:\\s+", regex, "\n")));
+}
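+
+// For reference, a /proc/[pid]/io or /proc/[pid]/task/[tid]/io file looks
+// roughly like this (values are illustrative); only the syscw line is
+// matched above:
+//   rchar: 2012
+//   wchar: 100
+//   syscr: 13
+//   syscw: 100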
+
+// Checks that IO is accounted separately for individual threads/tasks.
+TEST(Proc, PidTidIOAccounting) {
+ absl::Notification notification;
+
+ // Run a thread that does a bunch of writes. Check that the IO accounting
+ // records exactly the number of write calls. The file open/close on each
+ // iteration is there to prevent buffering.
+ ScopedThread writer([&notification] {
+ const int num_writes = 100;
+ for (int i = 0; i < num_writes; i++) {
+ auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ ASSERT_NO_ERRNO(SetContents(path.path(), "a"));
+ }
+ notification.Notify();
+ const std::string& writer_dir =
+ absl::StrCat("/proc/", getpid(), "/task/", gettid(), "/io");
+
+ CheckSyscwFromIOFile(writer_dir, std::to_string(num_writes));
+ });
+
+ // Run a thread and do no writes. Check that no writes are recorded.
+ ScopedThread noop([&notification] {
+ notification.WaitForNotification();
+ const std::string& noop_dir =
+ absl::StrCat("/proc/", getpid(), "/task/", gettid(), "/io");
+
+ CheckSyscwFromIOFile(noop_dir, "0");
+ });
+
+ writer.Join();
+ noop.Join();
+}
+
} // namespace
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/proc_net_tcp.cc b/test/syscalls/linux/proc_net_tcp.cc
index 2659f6a98..5b6e3e3cd 100644
--- a/test/syscalls/linux/proc_net_tcp.cc
+++ b/test/syscalls/linux/proc_net_tcp.cc
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <netinet/tcp.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
diff --git a/test/syscalls/linux/proc_net_udp.cc b/test/syscalls/linux/proc_net_udp.cc
index f06f1a24b..786b4b4af 100644
--- a/test/syscalls/linux/proc_net_udp.cc
+++ b/test/syscalls/linux/proc_net_udp.cc
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <netinet/tcp.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
diff --git a/test/syscalls/linux/readv_common.cc b/test/syscalls/linux/readv_common.cc
index 9658f7d42..2694dc64f 100644
--- a/test/syscalls/linux/readv_common.cc
+++ b/test/syscalls/linux/readv_common.cc
@@ -19,12 +19,53 @@
#include <unistd.h>
#include "gtest/gtest.h"
-#include "test/syscalls/linux/file_base.h"
#include "test/util/test_util.h"
namespace gvisor {
namespace testing {
+// MatchesStringLength checks that a tuple argument of (struct iovec *, int),
+// corresponding to an iovec array and its length, covers a total number of
+// bytes equal to the string length strlen.
+MATCHER_P(MatchesStringLength, strlen, "") {
+ struct iovec* iovs = arg.first;
+ int niov = arg.second;
+ int offset = 0;
+ for (int i = 0; i < niov; i++) {
+ offset += iovs[i].iov_len;
+ }
+ if (offset != static_cast<int>(strlen)) {
+ *result_listener << offset;
+ return false;
+ }
+ return true;
+}
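+
+// Illustrative usage of the matcher above (a sketch, not taken verbatim from
+// the tests):
+//   EXPECT_THAT(std::make_pair(iovs, iov_count),
+//               MatchesStringLength(strlen(kReadvTestData)));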
+
+// MatchesStringValue checks that a tuple argument of (struct iovec *, int),
+// corresponding to an iovec array and its length, contains data that matches
+// the string value str.
+MATCHER_P(MatchesStringValue, str, "") {
+ struct iovec* iovs = arg.first;
+ int len = strlen(str);
+ int niov = arg.second;
+ int offset = 0;
+ for (int i = 0; i < niov; i++) {
+ struct iovec iov = iovs[i];
+ if (len < offset) {
+ *result_listener << "strlen " << len << " < offset " << offset;
+ return false;
+ }
+ if (strncmp(static_cast<char*>(iov.iov_base), &str[offset], iov.iov_len)) {
+ absl::string_view iovec_string(static_cast<char*>(iov.iov_base),
+ iov.iov_len);
+ *result_listener << iovec_string << " @offset " << offset;
+ return false;
+ }
+ offset += iov.iov_len;
+ }
+ return true;
+}
+
extern const char kReadvTestData[] =
"127.0.0.1 localhost"
""
@@ -113,7 +154,7 @@ void ReadBuffersOverlapping(int fd) {
char* expected_ptr = expected.data();
memcpy(expected_ptr, &kReadvTestData[overlap_bytes], overlap_bytes);
memcpy(&expected_ptr[overlap_bytes], &kReadvTestData[overlap_bytes],
- kReadvTestDataSize);
+ kReadvTestDataSize - overlap_bytes);
struct iovec iovs[2];
iovs[0].iov_base = buffer.data();
diff --git a/test/syscalls/linux/readv_socket.cc b/test/syscalls/linux/readv_socket.cc
index 9b6972201..dd6fb7008 100644
--- a/test/syscalls/linux/readv_socket.cc
+++ b/test/syscalls/linux/readv_socket.cc
@@ -19,7 +19,6 @@
#include <unistd.h>
#include "gtest/gtest.h"
-#include "test/syscalls/linux/file_base.h"
#include "test/syscalls/linux/readv_common.h"
#include "test/util/test_util.h"
@@ -28,9 +27,30 @@ namespace testing {
namespace {
-class ReadvSocketTest : public SocketTest {
+class ReadvSocketTest : public ::testing::Test {
+ public:
void SetUp() override {
- SocketTest::SetUp();
+ test_unix_stream_socket_[0] = -1;
+ test_unix_stream_socket_[1] = -1;
+ test_unix_dgram_socket_[0] = -1;
+ test_unix_dgram_socket_[1] = -1;
+ test_unix_seqpacket_socket_[0] = -1;
+ test_unix_seqpacket_socket_[1] = -1;
+
+ ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, test_unix_stream_socket_),
+ SyscallSucceeds());
+ ASSERT_THAT(fcntl(test_unix_stream_socket_[0], F_SETFL, O_NONBLOCK),
+ SyscallSucceeds());
+ ASSERT_THAT(socketpair(AF_UNIX, SOCK_DGRAM, 0, test_unix_dgram_socket_),
+ SyscallSucceeds());
+ ASSERT_THAT(fcntl(test_unix_dgram_socket_[0], F_SETFL, O_NONBLOCK),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ socketpair(AF_UNIX, SOCK_SEQPACKET, 0, test_unix_seqpacket_socket_),
+ SyscallSucceeds());
+ ASSERT_THAT(fcntl(test_unix_seqpacket_socket_[0], F_SETFL, O_NONBLOCK),
+ SyscallSucceeds());
+
ASSERT_THAT(
write(test_unix_stream_socket_[1], kReadvTestData, kReadvTestDataSize),
SyscallSucceedsWithValue(kReadvTestDataSize));
@@ -40,11 +60,22 @@ class ReadvSocketTest : public SocketTest {
ASSERT_THAT(write(test_unix_seqpacket_socket_[1], kReadvTestData,
kReadvTestDataSize),
SyscallSucceedsWithValue(kReadvTestDataSize));
- // FIXME(b/69821513): Enable when possible.
- // ASSERT_THAT(write(test_tcp_socket_[1], kReadvTestData,
- // kReadvTestDataSize),
- // SyscallSucceedsWithValue(kReadvTestDataSize));
}
+
+ void TearDown() override {
+ close(test_unix_stream_socket_[0]);
+ close(test_unix_stream_socket_[1]);
+
+ close(test_unix_dgram_socket_[0]);
+ close(test_unix_dgram_socket_[1]);
+
+ close(test_unix_seqpacket_socket_[0]);
+ close(test_unix_seqpacket_socket_[1]);
+ }
+
+ int test_unix_stream_socket_[2];
+ int test_unix_dgram_socket_[2];
+ int test_unix_seqpacket_socket_[2];
};
TEST_F(ReadvSocketTest, ReadOneBufferPerByte_StreamSocket) {
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
new file mode 100644
index 000000000..106c045e3
--- /dev/null
+++ b/test/syscalls/linux/rseq.cc
@@ -0,0 +1,198 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/rseq/test.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+#include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Syscall test for rseq (restartable sequences).
+//
+// We must be very careful about how these tests are written. Each thread may
+// only have one struct rseq registration, which may be done automatically at
+// thread start (as of 2019-11-13, glibc does *not* support rseq and thus does
+// not do so).
+//
+// Testing of rseq is thus done primarily in a child process with no
+// registration. This means exec'ing a nostdlib binary, as rseq registration can
+// only be cleared by execve (or knowing the old rseq address), and glibc (based
+// on the current unmerged patches) registers rseq before calling main().
+
+int RSeq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
+ return syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
+}
+
+// Returns true if this kernel supports the rseq syscall.
+PosixErrorOr<bool> RSeqSupported() {
+ // We have to be careful here; there are three possible cases:
+ //
+ // 1. rseq is not supported -> ENOSYS
+ // 2. rseq is supported and not registered -> success, but we should
+ // unregister.
+ // 3. rseq is supported and registered -> EINVAL (most likely).
+
+ // The only validation done on new registrations is that rseq is aligned and
+ // writable.
+ rseq rseq = {};
+ int ret = RSeq(&rseq, sizeof(rseq), 0, 0);
+ if (ret == 0) {
+ // Successfully registered, rseq is supported. Unregister.
+ ret = RSeq(&rseq, sizeof(rseq), kRseqFlagUnregister, 0);
+ if (ret != 0) {
+ return PosixError(errno);
+ }
+ return true;
+ }
+
+ switch (errno) {
+ case ENOSYS:
+ // Not supported.
+ return false;
+ case EINVAL:
+ // Supported, but already registered. EINVAL returned because we provided
+ // a different address.
+ return true;
+ default:
+ // Unknown error.
+ return PosixError(errno);
+ }
+}
+
+constexpr char kRseqBinary[] = "test/syscalls/linux/rseq/rseq";
+
+void RunChildTest(std::string test_case, int want_status) {
+ std::string path = RunfilePath(kRseqBinary);
+
+ pid_t child_pid = -1;
+ int execve_errno = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(path, {path, test_case}, {}, &child_pid, &execve_errno));
+
+ ASSERT_GT(child_pid, 0);
+ ASSERT_EQ(execve_errno, 0);
+
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ ASSERT_EQ(status, want_status);
+}
+
+// Test that rseq must be aligned.
+TEST(RseqTest, Unaligned) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestUnaligned, 0);
+}
+
+// Sanity test that registration works.
+TEST(RseqTest, Register) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestRegister, 0);
+}
+
+// Registration can't be done twice.
+TEST(RseqTest, DoubleRegister) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestDoubleRegister, 0);
+}
+
+// Registration can be done again after unregister.
+TEST(RseqTest, RegisterUnregister) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestRegisterUnregister, 0);
+}
+
+// The pointer to rseq must match on register/unregister.
+TEST(RseqTest, UnregisterDifferentPtr) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestUnregisterDifferentPtr, 0);
+}
+
+// The signature must match on register/unregister.
+TEST(RseqTest, UnregisterDifferentSignature) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestUnregisterDifferentSignature, 0);
+}
+
+// The CPU ID is initialized.
+TEST(RseqTest, CPU) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestCPU, 0);
+}
+
+// Critical section is eventually aborted.
+TEST(RseqTest, Abort) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestAbort, 0);
+}
+
+// Abort may be before the critical section.
+TEST(RseqTest, AbortBefore) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestAbortBefore, 0);
+}
+
+// Signature must match.
+TEST(RseqTest, AbortSignature) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestAbortSignature, SIGSEGV);
+}
+
+// Abort must not be in the critical section.
+TEST(RseqTest, AbortPreCommit) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestAbortPreCommit, SIGSEGV);
+}
+
+// rseq.rseq_cs is cleared on abort.
+TEST(RseqTest, AbortClearsCS) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestAbortClearsCS, 0);
+}
+
+// rseq.rseq_cs is cleared on abort outside of critical section.
+TEST(RseqTest, InvalidAbortClearsCS) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestInvalidAbortClearsCS, 0);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD
new file mode 100644
index 000000000..5cfe4e56f
--- /dev/null
+++ b/test/syscalls/linux/rseq/BUILD
@@ -0,0 +1,59 @@
+# This package contains a standalone rseq test binary. This binary must not
+# depend on libc, which might use rseq itself.
+
+load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier")
+load("@rules_cc//cc:defs.bzl", "cc_library")
+
+package(licenses = ["notice"])
+
+genrule(
+ name = "rseq_binary",
+ srcs = [
+ "critical.h",
+ "critical.S",
+ "rseq.cc",
+ "syscalls.h",
+ "start.S",
+ "test.h",
+ "types.h",
+ "uapi.h",
+ ],
+ outs = ["rseq"],
+ cmd = " ".join([
+ "$(CC)",
+ "$(CC_FLAGS) ",
+ "-I.",
+ "-Wall",
+ "-Werror",
+ "-O2",
+ "-std=c++17",
+ "-static",
+ "-nostdlib",
+ "-ffreestanding",
+ "-o",
+ "$(location rseq)",
+ "$(location critical.S)",
+ "$(location rseq.cc)",
+ "$(location start.S)",
+ ]),
+ toolchains = [
+ ":no_pie_cc_flags",
+ "@bazel_tools//tools/cpp:current_cc_toolchain",
+ ],
+ visibility = ["//:sandbox"],
+)
+
+cc_flags_supplier(
+ name = "no_pie_cc_flags",
+ features = ["-pie"],
+)
+
+cc_library(
+ name = "lib",
+ testonly = 1,
+ hdrs = [
+ "test.h",
+ "uapi.h",
+ ],
+ visibility = ["//:sandbox"],
+)
diff --git a/test/syscalls/linux/rseq/critical.S b/test/syscalls/linux/rseq/critical.S
new file mode 100644
index 000000000..8c0687e6d
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical.S
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Restartable sequences critical sections.
+
+// Loops continuously until aborted.
+//
+// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
+
+ .text
+ .globl rseq_loop
+ .type rseq_loop, @function
+
+rseq_loop:
+ jmp begin
+
+ // Abort block before the critical section.
+ // Abort signature is 4 nops for simplicity.
+ .byte 0x90, 0x90, 0x90, 0x90
+ .globl rseq_loop_early_abort
+rseq_loop_early_abort:
+ ret
+
+begin:
+ // r->rseq_cs = cs
+ movq %rsi, 8(%rdi)
+
+ // N.B. rseq_cs will be cleared by any preempt, even outside the critical
+ // section. Thus it must be set in or immediately before the critical section
+ // to ensure it is not cleared before the section begins.
+ .globl rseq_loop_start
+rseq_loop_start:
+ jmp rseq_loop_start
+
+ // "Pre-commit": extra instructions inside the critical section. These are
+ // used as the abort point in TestAbortPreCommit, which is not valid.
+ .globl rseq_loop_pre_commit
+rseq_loop_pre_commit:
+ // Extra abort signature + nop for TestAbortPostCommit.
+ .byte 0x90, 0x90, 0x90, 0x90
+ nop
+
+ // "Post-commit": never reached in this case.
+ .globl rseq_loop_post_commit
+rseq_loop_post_commit:
+
+ // Abort signature is 4 nops for simplicity.
+ .byte 0x90, 0x90, 0x90, 0x90
+
+ .globl rseq_loop_abort
+rseq_loop_abort:
+ ret
+
+ .size rseq_loop,.-rseq_loop
+ .section .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/critical.h b/test/syscalls/linux/rseq/critical.h
new file mode 100644
index 000000000..ac987a25e
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical.h
@@ -0,0 +1,39 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
+
+#include "test/syscalls/linux/rseq/types.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+
+constexpr uint32_t kRseqSignature = 0x90909090;
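+// The kernel checks that the four bytes immediately preceding the abort IP
+// match the signature passed at registration, and delivers SIGSEGV on a
+// mismatch; 0x90909090 is four x86 NOPs, matching the
+// ".byte 0x90, 0x90, 0x90, 0x90" markers before the abort labels in
+// critical.S.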
+
+extern "C" {
+
+extern void rseq_loop(struct rseq* r, struct rseq_cs* cs);
+extern void* rseq_loop_early_abort;
+extern void* rseq_loop_start;
+extern void* rseq_loop_pre_commit;
+extern void* rseq_loop_post_commit;
+extern void* rseq_loop_abort;
+
+extern int rseq_getpid(struct rseq* r, struct rseq_cs* cs);
+extern void* rseq_getpid_start;
+extern void* rseq_getpid_post_commit;
+extern void* rseq_getpid_abort;
+
+} // extern "C"
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
diff --git a/test/syscalls/linux/rseq/rseq.cc b/test/syscalls/linux/rseq/rseq.cc
new file mode 100644
index 000000000..f036db26d
--- /dev/null
+++ b/test/syscalls/linux/rseq/rseq.cc
@@ -0,0 +1,366 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/rseq/critical.h"
+#include "test/syscalls/linux/rseq/syscalls.h"
+#include "test/syscalls/linux/rseq/test.h"
+#include "test/syscalls/linux/rseq/types.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+
+namespace gvisor {
+namespace testing {
+
+extern "C" int main(int argc, char** argv, char** envp);
+
+// Standalone initialization before calling main().
+extern "C" void __init(uintptr_t* sp) {
+ int argc = sp[0];
+ char** argv = reinterpret_cast<char**>(&sp[1]);
+ char** envp = &argv[argc + 1];
+
+ // Call main() and exit.
+ sys_exit_group(main(argc, argv, envp));
+
+ // sys_exit_group does not return
+}
+
+int strcmp(const char* s1, const char* s2) {
+ const unsigned char* p1 = reinterpret_cast<const unsigned char*>(s1);
+ const unsigned char* p2 = reinterpret_cast<const unsigned char*>(s2);
+
+ while (*p1 == *p2) {
+ if (!*p1) {
+ return 0;
+ }
+ ++p1;
+ ++p2;
+ }
+ return static_cast<int>(*p1) - static_cast<int>(*p2);
+}
+
+int sys_rseq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
+ return raw_syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
+}
+
+// Test that rseq must be aligned.
+int TestUnaligned() {
+ constexpr uintptr_t kRequiredAlignment = alignof(rseq);
+
+ char buf[2 * kRequiredAlignment] = {};
+ uintptr_t ptr = reinterpret_cast<uintptr_t>(&buf[0]);
+ if ((ptr & (kRequiredAlignment - 1)) == 0) {
+ // buf is already aligned. Misalign it.
+ ptr++;
+ }
+
+ int ret = sys_rseq(reinterpret_cast<rseq*>(ptr), sizeof(rseq), 0, 0);
+ if (sys_errno(ret) != EINVAL) {
+ return 1;
+ }
+ return 0;
+}
+
+// Sanity test that registration works.
+int TestRegister() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+ return 1;
+ }
+ return 0;
+};
+
+// Registration can't be done twice.
+int TestDoubleRegister() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != EBUSY) {
+ return 1;
+ }
+
+ return 0;
+};
+
+// Registration can be done again after unregister.
+int TestRegisterUnregister() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, 0);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ return 0;
+};
+
+// The pointer to rseq must match on register/unregister.
+int TestUnregisterDifferentPtr() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq r2 = {};
+ if (int ret = sys_rseq(&r2, sizeof(r2), kRseqFlagUnregister, 0);
+ sys_errno(ret) != EINVAL) {
+ return 1;
+ }
+
+ return 0;
+};
+
+// The signature must match on register/unregister.
+int TestUnregisterDifferentSignature() {
+ constexpr int kSignature = 0;
+
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kSignature); sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, kSignature + 1);
+ sys_errno(ret) != EPERM) {
+ return 1;
+ }
+
+ return 0;
+};
+
+// The CPU ID is initialized.
+int TestCPU() {
+ struct rseq r = {};
+ r.cpu_id = kRseqCPUIDUninitialized;
+
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ if (__atomic_load_n(&r.cpu_id, __ATOMIC_RELAXED) < 0) {
+ return 1;
+ }
+ if (__atomic_load_n(&r.cpu_id_start, __ATOMIC_RELAXED) < 0) {
+ return 1;
+ }
+
+ return 0;
+};
+
+// Critical section is eventually aborted.
+int TestAbort() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq_cs cs = {};
+ cs.version = 0;
+ cs.flags = 0;
+ cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+ reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+ // Loops until abort. If this returns then abort occurred.
+ rseq_loop(&r, &cs);
+
+ return 0;
+};
+
+// Abort may be before the critical section.
+int TestAbortBefore() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq_cs cs = {};
+ cs.version = 0;
+ cs.flags = 0;
+ cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+ reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_early_abort);
+
+ // Loops until abort. If this returns then abort occurred.
+ rseq_loop(&r, &cs);
+
+ return 0;
+};
+
+// Signature must match.
+int TestAbortSignature() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq_cs cs = {};
+ cs.version = 0;
+ cs.flags = 0;
+ cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+ reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+ // Loops until abort. This should SIGSEGV on abort.
+ rseq_loop(&r, &cs);
+
+ return 1;
+};
+
+// Abort must not be in the critical section.
+int TestAbortPreCommit() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq_cs cs = {};
+ cs.version = 0;
+ cs.flags = 0;
+ cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+ reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_pre_commit);
+
+ // Loops until abort. This should SIGSEGV on abort.
+ rseq_loop(&r, &cs);
+
+ return 1;
+};
+
+// rseq.rseq_cs is cleared on abort.
+int TestAbortClearsCS() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq_cs cs = {};
+ cs.version = 0;
+ cs.flags = 0;
+ cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+ reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+ // Loops until abort. If this returns then abort occurred.
+ rseq_loop(&r, &cs);
+
+ if (__atomic_load_n(&r.rseq_cs, __ATOMIC_RELAXED)) {
+ return 1;
+ }
+
+ return 0;
+};
+
+// rseq.rseq_cs is cleared on abort outside of critical section.
+int TestInvalidAbortClearsCS() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq_cs cs = {};
+ cs.version = 0;
+ cs.flags = 0;
+ cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+ reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+ __atomic_store_n(&r.rseq_cs, &cs, __ATOMIC_RELAXED);
+
+ // When the next abort condition occurs, the kernel will clear cs once it
+ // determines we aren't in the critical section.
+ while (1) {
+ if (!__atomic_load_n(&r.rseq_cs, __ATOMIC_RELAXED)) {
+ break;
+ }
+ }
+
+ return 0;
+};
+
+// Exit codes:
+// 0 - Pass
+// 1 - Fail
+// 2 - Missing argument
+// 3 - Unknown test case
+extern "C" int main(int argc, char** argv, char** envp) {
+ if (argc != 2) {
+ // Usage: rseq <test case>
+ return 2;
+ }
+
+ if (strcmp(argv[1], kRseqTestUnaligned) == 0) {
+ return TestUnaligned();
+ }
+ if (strcmp(argv[1], kRseqTestRegister) == 0) {
+ return TestRegister();
+ }
+ if (strcmp(argv[1], kRseqTestDoubleRegister) == 0) {
+ return TestDoubleRegister();
+ }
+ if (strcmp(argv[1], kRseqTestRegisterUnregister) == 0) {
+ return TestRegisterUnregister();
+ }
+ if (strcmp(argv[1], kRseqTestUnregisterDifferentPtr) == 0) {
+ return TestUnregisterDifferentPtr();
+ }
+ if (strcmp(argv[1], kRseqTestUnregisterDifferentSignature) == 0) {
+ return TestUnregisterDifferentSignature();
+ }
+ if (strcmp(argv[1], kRseqTestCPU) == 0) {
+ return TestCPU();
+ }
+ if (strcmp(argv[1], kRseqTestAbort) == 0) {
+ return TestAbort();
+ }
+ if (strcmp(argv[1], kRseqTestAbortBefore) == 0) {
+ return TestAbortBefore();
+ }
+ if (strcmp(argv[1], kRseqTestAbortSignature) == 0) {
+ return TestAbortSignature();
+ }
+ if (strcmp(argv[1], kRseqTestAbortPreCommit) == 0) {
+ return TestAbortPreCommit();
+ }
+ if (strcmp(argv[1], kRseqTestAbortClearsCS) == 0) {
+ return TestAbortClearsCS();
+ }
+ if (strcmp(argv[1], kRseqTestInvalidAbortClearsCS) == 0) {
+ return TestInvalidAbortClearsCS();
+ }
+
+ return 3;
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/rseq/start.S b/test/syscalls/linux/rseq/start.S
new file mode 100644
index 000000000..b9611b276
--- /dev/null
+++ b/test/syscalls/linux/rseq/start.S
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+ .text
+ .align 4
+ .type _start,@function
+ .globl _start
+
+_start:
+ movq %rsp,%rdi
+ call __init
+ hlt
+
+ .size _start,.-_start
+ .section .note.GNU-stack,"",@progbits
+
+ .text
+ .globl raw_syscall
+ .type raw_syscall, @function
+
+raw_syscall:
+ mov %rdi,%rax // syscall #
+ mov %rsi,%rdi // arg0
+ mov %rdx,%rsi // arg1
+ mov %rcx,%rdx // arg2
+ mov %r8,%r10 // arg3 (goes in r10 instead of rcx for system calls)
+ mov %r9,%r8 // arg4
+ mov 0x8(%rsp),%r9 // arg5
+ syscall
+ ret
+
+ .size raw_syscall,.-raw_syscall
+ .section .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/syscalls.h b/test/syscalls/linux/rseq/syscalls.h
new file mode 100644
index 000000000..e5299c188
--- /dev/null
+++ b/test/syscalls/linux/rseq/syscalls.h
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
+
+#include "test/syscalls/linux/rseq/types.h"
+
+#ifdef __x86_64__
+// Syscall numbers.
+constexpr int kGetpid = 39;
+constexpr int kExitGroup = 231;
+#else
+#error "Unknown architecture"
+#endif
+
+namespace gvisor {
+namespace testing {
+
+// Standalone system call interfaces.
+// Note that these are all "raw" system call interfaces which encode
+// errors by setting the return value to a small negative number.
+// Use sys_errno() to check system call return values for errors.
+
+// Maximum Linux error number.
+constexpr int kMaxErrno = 4095;
+
+// Errno values.
+#define EPERM 1
+#define EFAULT 14
+#define EBUSY 16
+#define EINVAL 22
+
+// Get the error number from a raw system call return value.
+// Returns a positive error number or 0 if there was no error.
+static inline int sys_errno(uintptr_t rval) {
+ if (rval >= static_cast<uintptr_t>(-kMaxErrno)) {
+ return -static_cast<int>(rval);
+ }
+ return 0;
+}
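+
+// Illustrative use (a sketch): raw syscalls return -errno on failure, so
+//   uintptr_t ret = raw_syscall(number, args...);
+//   if (sys_errno(ret) != 0) { /* ret encodes the error */ }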
+
+extern "C" uintptr_t raw_syscall(int number, ...);
+
+static inline void sys_exit_group(int status) {
+ raw_syscall(kExitGroup, status);
+}
+static inline int sys_getpid() {
+ return static_cast<int>(raw_syscall(kGetpid));
+}
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
diff --git a/test/syscalls/linux/rseq/test.h b/test/syscalls/linux/rseq/test.h
new file mode 100644
index 000000000..3b7bb74b1
--- /dev/null
+++ b/test/syscalls/linux/rseq/test.h
@@ -0,0 +1,43 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
+
+namespace gvisor {
+namespace testing {
+
+// Test cases supported by rseq binary.
+
+inline constexpr char kRseqTestUnaligned[] = "unaligned";
+inline constexpr char kRseqTestRegister[] = "register";
+inline constexpr char kRseqTestDoubleRegister[] = "double-register";
+inline constexpr char kRseqTestRegisterUnregister[] = "register-unregister";
+inline constexpr char kRseqTestUnregisterDifferentPtr[] =
+ "unregister-different-ptr";
+inline constexpr char kRseqTestUnregisterDifferentSignature[] =
+ "unregister-different-signature";
+inline constexpr char kRseqTestCPU[] = "cpu";
+inline constexpr char kRseqTestAbort[] = "abort";
+inline constexpr char kRseqTestAbortBefore[] = "abort-before";
+inline constexpr char kRseqTestAbortSignature[] = "abort-signature";
+inline constexpr char kRseqTestAbortPreCommit[] = "abort-precommit";
+inline constexpr char kRseqTestAbortClearsCS[] = "abort-clears-cs";
+inline constexpr char kRseqTestInvalidAbortClearsCS[] =
+ "invalid-abort-clears-cs";
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
diff --git a/test/syscalls/linux/rseq/types.h b/test/syscalls/linux/rseq/types.h
new file mode 100644
index 000000000..b6afe9817
--- /dev/null
+++ b/test/syscalls/linux/rseq/types.h
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
+
+using size_t = __SIZE_TYPE__;
+using uintptr_t = __UINTPTR_TYPE__;
+
+using uint8_t = __UINT8_TYPE__;
+using uint16_t = __UINT16_TYPE__;
+using uint32_t = __UINT32_TYPE__;
+using uint64_t = __UINT64_TYPE__;
+
+using int8_t = __INT8_TYPE__;
+using int16_t = __INT16_TYPE__;
+using int32_t = __INT32_TYPE__;
+using int64_t = __INT64_TYPE__;
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
diff --git a/test/syscalls/linux/rseq/uapi.h b/test/syscalls/linux/rseq/uapi.h
new file mode 100644
index 000000000..e3ff0579a
--- /dev/null
+++ b/test/syscalls/linux/rseq/uapi.h
@@ -0,0 +1,54 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
+
+// User-kernel ABI for restartable sequences.
+
+// Standard types.
+//
+// N.B. This header will be included in targets that do have the standard
+// library, so we can't shadow the standard type names.
+using __u32 = __UINT32_TYPE__;
+using __u64 = __UINT64_TYPE__;
+
+#ifdef __x86_64__
+// Syscall numbers.
+constexpr int kRseqSyscall = 334;
+#else
+#error "Unknown architecture"
+#endif // __x86_64__
+
+struct rseq_cs {
+ __u32 version;
+ __u32 flags;
+ __u64 start_ip;
+ __u64 post_commit_offset;
+ __u64 abort_ip;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+// N.B. alignment is enforced by the kernel.
+struct rseq {
+ __u32 cpu_id_start;
+ __u32 cpu_id;
+ struct rseq_cs* rseq_cs;
+ __u32 flags;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+constexpr int kRseqFlagUnregister = 1 << 0;
+
+constexpr int kRseqCPUIDUninitialized = -1;
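+
+// Minimal registration sketch (illustrative only; the real tests are in
+// rseq.cc and critical.S):
+//   struct rseq r = {};
+//   syscall(kRseqSyscall, &r, sizeof(r), 0, sig);                    // register
+//   syscall(kRseqSyscall, &r, sizeof(r), kRseqFlagUnregister, sig);  // unregister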
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc
index 6fd3989a4..a778fa639 100644
--- a/test/syscalls/linux/sigaltstack.cc
+++ b/test/syscalls/linux/sigaltstack.cc
@@ -95,13 +95,7 @@ TEST(SigaltstackTest, ResetByExecve) {
auto const cleanup_sigstack =
ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaltstack(stack));
- std::string full_path;
- char* test_src = getenv("TEST_SRCDIR");
- if (test_src) {
- full_path = JoinPath(test_src, "../../linux/sigaltstack_check");
- }
-
- ASSERT_FALSE(full_path.empty());
+ std::string full_path = RunfilePath("test/syscalls/linux/sigaltstack_check");
pid_t child_pid = -1;
int execve_errno = 0;
diff --git a/test/syscalls/linux/socket_bind_to_device_distribution.cc b/test/syscalls/linux/socket_bind_to_device_distribution.cc
index 5767181a1..5ed57625c 100644
--- a/test/syscalls/linux/socket_bind_to_device_distribution.cc
+++ b/test/syscalls/linux/socket_bind_to_device_distribution.cc
@@ -183,7 +183,14 @@ TEST_P(BindToDeviceDistributionTest, Tcp) {
}
// Receive some data from a socket to be sure that the connect()
// system call has been completed on another side.
- int data;
+ // Do a short read and then close the socket to trigger a RST. This
+ // ensures that both ends of the connection are cleaned up and no
+ // goroutines hang around in TIME-WAIT. We do this so that this test
+ // does not time out under gotsan runs where lots of goroutines can
+ // cause the test to use absurd amounts of memory.
+ //
+ // See: https://tools.ietf.org/html/rfc2525#page-50 section 2.17
+ uint16_t data;
EXPECT_THAT(
RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
SyscallSucceedsWithValue(sizeof(data)));
@@ -198,15 +205,29 @@ TEST_P(BindToDeviceDistributionTest, Tcp) {
}
for (int i = 0; i < kConnectAttempts; i++) {
- FileDescriptor const fd = ASSERT_NO_ERRNO_AND_VALUE(
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
ASSERT_THAT(
RetryEINTR(connect)(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
connector.addr_len),
SyscallSucceeds());
+ // Do two separate sends to ensure two segments are received. This is
+ // required for netstack, where read incorrectly assumes that a whole
+ // segment has been consumed when endpoint.Read() is called, even though
+ // the syscall that invoked endpoint.Read() may have consumed it only
+ // partially. As a result, a close() of such a socket does not trigger
+ // a RST in netstack because the endpoint believes it has no unread
+ // data.
EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
SyscallSucceedsWithValue(sizeof(i)));
+
+ // TODO(gvisor.dev/issue/1449): Remove this block once netstack correctly
+ // generates a RST.
+ if (IsRunningOnGvisor()) {
+ EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+ }
}
// Join threads to be sure that all connections have been counted.
diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
index e4641c62e..34b1058a9 100644
--- a/test/syscalls/linux/socket_bind_to_device_sequence.cc
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -97,12 +97,22 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
sockets_to_close_.erase(socket_id);
}
- // Bind a socket with the reuse option and bind_to_device options. Checks
+ // SetDevice changes the bind_to_device option. It does not bind or re-bind.
+ void SetDevice(int socket_id, int device_id) {
+ auto socket_fd = sockets_to_close_[socket_id]->get();
+ string device_name;
+ ASSERT_NO_FATAL_FAILURE(GetDevice(device_id, &device_name));
+ EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE,
+ device_name.c_str(), device_name.size() + 1),
+ SyscallSucceedsWithValue(0));
+ }
+
+ // Bind a socket with the reuse_port, reuse_addr, and bind_to_device options. Checks
// that all steps succeed and that the bind command's error matches want.
// Sets the socket_id to uniquely identify the socket bound if it is not
// nullptr.
- void BindSocket(bool reuse, int device_id = 0, int want = 0,
- int *socket_id = nullptr) {
+ void BindSocket(bool reuse_port, bool reuse_addr, int device_id = 0,
+ int want = 0, int *socket_id = nullptr) {
next_socket_id_++;
sockets_to_close_[next_socket_id_] = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
auto socket_fd = sockets_to_close_[next_socket_id_]->get();
@@ -110,13 +120,20 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
*socket_id = next_socket_id_;
}
- // If reuse is indicated, do that.
- if (reuse) {
+ // If reuse_port is indicated, do that.
+ if (reuse_port) {
EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
sizeof(kSockOptOn)),
SyscallSucceedsWithValue(0));
}
+ // If reuse_addr is indicated, do that.
+ if (reuse_addr) {
+ EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+ }
+
// If the device is non-zero, bind to that device.
if (device_id != 0) {
string device_name;
@@ -182,129 +199,308 @@ class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
};
TEST_P(BindToDeviceSequenceTest, BindTwiceWithDeviceFails) {
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 3));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 3, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 3));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 3, EADDRINUSE));
}
TEST_P(BindToDeviceSequenceTest, BindToDevice) {
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 1));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 2));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 1));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 2));
}
TEST_P(BindToDeviceSequenceTest, BindToDeviceAndThenWithoutDevice) {
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 123));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
}
TEST_P(BindToDeviceSequenceTest, BindWithoutDevice) {
- ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ false));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 123, EADDRINUSE));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
}
TEST_P(BindToDeviceSequenceTest, BindWithDevice) {
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 123, 0));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 123, EADDRINUSE));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 456, 0));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 789, 0));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 456, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 789, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
}
TEST_P(BindToDeviceSequenceTest, BindWithReuse) {
- ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 123));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
- ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true, /* bind_to_device */ 0));
+ BindSocket(/* reuse_port */ true, /* reuse_addr */ false));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false,
+ /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 0));
}
TEST_P(BindToDeviceSequenceTest, BindingWithReuseAndDevice) {
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 123));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 123, EADDRINUSE));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 123));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 0, EADDRINUSE));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 456));
- ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse */ true));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 789));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 999, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 456));
+ ASSERT_NO_FATAL_FAILURE(
+ BindSocket(/* reuse_port */ true, /* reuse_addr */ false));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 789));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 999, EADDRINUSE));
}
TEST_P(BindToDeviceSequenceTest, MixingReuseAndNotReuseByBindingToDevice) {
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 123, 0));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 456, 0));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 789, 0));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 999, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 456, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 789, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 999, 0));
}
TEST_P(BindToDeviceSequenceTest, CannotBindTo0AfterMixingReuseAndNotReuse) {
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 123));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 456));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 456));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
}
TEST_P(BindToDeviceSequenceTest, BindAndRelease) {
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
int to_release;
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 0, 0, &to_release));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 345, EADDRINUSE));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 789));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, 0, &to_release));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 345, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 789));
// Release the bind to device 0 and try again.
ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 345));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 345));
}
TEST_P(BindToDeviceSequenceTest, BindTwiceWithReuseOnce) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindWithReuseAddr) {
+ // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+ SKIP_IF(IsRunningOnGvisor());
+
ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ false, /* bind_to_device */ 123));
- ASSERT_NO_FATAL_FAILURE(
- BindSocket(/* reuse */ true, /* bind_to_device */ 0, EADDRINUSE));
+ BindSocket(/* reuse_port */ false, /* reuse_addr */ true));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+}
+
+TEST_P(BindToDeviceSequenceTest,
+ CannotBindTo0AfterMixingReuseAddrAndNotReuseAddr) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 456));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReusePort) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReuseAddr) {
+ // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+ SKIP_IF(IsRunningOnGvisor());
+
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReusePort) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReuseAddr) {
+ // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+ SKIP_IF(IsRunningOnGvisor());
+
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReusePortThenReuseAddrReusePort) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrThenReuseAddr) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+// This behavior seems like a bug?
+TEST_P(BindToDeviceSequenceTest,
+ BindReuseAddrThenReuseAddrReusePortThenReuseAddr) {
+ // FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
+ SKIP_IF(IsRunningOnGvisor());
+
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0));
+}
+
+// Repro test for gvisor.dev/issue/1217. Not replicated in ports_test.go as this
+// test is different from the others and wouldn't fit well there.
+TEST_P(BindToDeviceSequenceTest, BindAndReleaseDifferentDevice) {
+ int to_release;
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 3, 0, &to_release));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 3, EADDRINUSE));
+ // Change the device. Since the socket was already bound, this should have no
+ // effect.
+ SetDevice(to_release, 2);
+ // Release the bind to device 3 and try again.
+ ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 3));
}
INSTANTIATE_TEST_SUITE_P(BindToDeviceTest, BindToDeviceSequenceTest,
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index 96a1731cf..2f9821555 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -13,12 +13,10 @@
// limitations under the License.
#include <arpa/inet.h>
-#include <linux/tcp.h>
#include <netinet/in.h>
+#include <netinet/tcp.h>
#include <poll.h>
#include <string.h>
-#include <sys/epoll.h>
-#include <sys/socket.h>
#include <atomic>
#include <iostream>
@@ -34,6 +32,7 @@
#include "absl/strings/str_cat.h"
#include "absl/time/clock.h"
#include "absl/time/time.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
#include "test/syscalls/linux/socket_test_util.h"
#include "test/util/file_descriptor.h"
#include "test/util/posix_error.h"
@@ -46,6 +45,8 @@ namespace testing {
namespace {
+using ::testing::Gt;
+
PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
switch (family) {
case AF_INET:
@@ -102,19 +103,172 @@ TEST(BadSocketPairArgs, ValidateErrForBadCallsToSocketPair) {
SyscallFailsWithErrno(EAFNOSUPPORT));
}
-TEST_P(SocketInetLoopbackTest, TCP) {
- auto const& param = GetParam();
+enum class Operation {
+ Bind,
+ Connect,
+ SendTo,
+};
- TestAddress const& listener = param.listener;
- TestAddress const& connector = param.connector;
+std::string OperationToString(Operation operation) {
+ switch (operation) {
+ case Operation::Bind:
+ return "Bind";
+ case Operation::Connect:
+ return "Connect";
+ case Operation::SendTo:
+ return "SendTo";
+ }
+}
+
+using OperationSequence = std::vector<Operation>;
+
+using DualStackSocketTest =
+ ::testing::TestWithParam<std::tuple<TestAddress, OperationSequence>>;
+
+TEST_P(DualStackSocketTest, AddressOperations) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_DGRAM, 0));
+
+ const TestAddress& addr = std::get<0>(GetParam());
+ const OperationSequence& operations = std::get<1>(GetParam());
+
+ auto addr_in = reinterpret_cast<const sockaddr*>(&addr.addr);
+
+ // Sockets may only be bound once. Both `connect` and `sendto` cause a
+ // socket to be implicitly bound.
+ bool bound = false;
+ for (const Operation& operation : operations) {
+ bool sockname = false;
+ bool peername = false;
+ switch (operation) {
+ case Operation::Bind: {
+ ASSERT_NO_ERRNO(SetAddrPort(
+ addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 0));
+
+ int bind_ret = bind(fd.get(), addr_in, addr.addr_len);
+
+ // Dual-stack sockets may only be bound to AF_INET6 addresses.
+ if (!bound && addr.family() == AF_INET6) {
+ EXPECT_THAT(bind_ret, SyscallSucceeds());
+ bound = true;
+
+ sockname = true;
+ } else {
+ EXPECT_THAT(bind_ret, SyscallFailsWithErrno(EINVAL));
+ }
+ break;
+ }
+ case Operation::Connect: {
+ ASSERT_NO_ERRNO(SetAddrPort(
+ addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 1337));
+
+ EXPECT_THAT(connect(fd.get(), addr_in, addr.addr_len),
+ SyscallSucceeds())
+ << GetAddrStr(addr_in);
+ bound = true;
+
+ sockname = true;
+ peername = true;
+
+ break;
+ }
+ case Operation::SendTo: {
+ const char payload[] = "hello";
+ ASSERT_NO_ERRNO(SetAddrPort(
+ addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 1337));
+
+ ssize_t sendto_ret = sendto(fd.get(), &payload, sizeof(payload), 0,
+ addr_in, addr.addr_len);
+
+ EXPECT_THAT(sendto_ret, SyscallSucceedsWithValue(sizeof(payload)));
+ sockname = !bound;
+ bound = true;
+ break;
+ }
+ }
+
+ if (sockname) {
+ sockaddr_storage sock_addr;
+ socklen_t addrlen = sizeof(sock_addr);
+ ASSERT_THAT(getsockname(fd.get(), reinterpret_cast<sockaddr*>(&sock_addr),
+ &addrlen),
+ SyscallSucceeds());
+ ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6));
+
+ auto sock_addr_in6 = reinterpret_cast<const sockaddr_in6*>(&sock_addr);
+
+ if (operation == Operation::SendTo) {
+ EXPECT_EQ(sock_addr_in6->sin6_family, AF_INET6);
+ EXPECT_TRUE(IN6_IS_ADDR_UNSPECIFIED(sock_addr_in6->sin6_addr.s6_addr32))
+ << OperationToString(operation) << " getsockname="
+ << GetAddrStr(reinterpret_cast<sockaddr*>(&sock_addr));
+
+ EXPECT_NE(sock_addr_in6->sin6_port, 0);
+ } else if (IN6_IS_ADDR_V4MAPPED(
+ reinterpret_cast<const sockaddr_in6*>(addr_in)
+ ->sin6_addr.s6_addr32)) {
+ EXPECT_TRUE(IN6_IS_ADDR_V4MAPPED(sock_addr_in6->sin6_addr.s6_addr32))
+ << OperationToString(operation) << " getsockname="
+ << GetAddrStr(reinterpret_cast<sockaddr*>(&sock_addr));
+ }
+ }
+ if (peername) {
+ sockaddr_storage peer_addr;
+ socklen_t addrlen = sizeof(peer_addr);
+ ASSERT_THAT(getpeername(fd.get(), reinterpret_cast<sockaddr*>(&peer_addr),
+ &addrlen),
+ SyscallSucceeds());
+ ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6));
+
+ if (addr.family() == AF_INET ||
+ IN6_IS_ADDR_V4MAPPED(reinterpret_cast<const sockaddr_in6*>(addr_in)
+ ->sin6_addr.s6_addr32)) {
+ EXPECT_TRUE(IN6_IS_ADDR_V4MAPPED(
+ reinterpret_cast<const sockaddr_in6*>(&peer_addr)
+ ->sin6_addr.s6_addr32))
+ << OperationToString(operation) << " getpeername="
+ << GetAddrStr(reinterpret_cast<sockaddr*>(&peer_addr));
+ }
+ }
+ }
+}
+
+// TODO(gvisor.dev/issues/1556): uncomment V4MappedAny.
+INSTANTIATE_TEST_SUITE_P(
+ All, DualStackSocketTest,
+ ::testing::Combine(
+ ::testing::Values(V4Any(), V4Loopback(), /*V4MappedAny(),*/
+ V4MappedLoopback(), V6Any(), V6Loopback()),
+ ::testing::ValuesIn<OperationSequence>(
+ {{Operation::Bind, Operation::Connect, Operation::SendTo},
+ {Operation::Bind, Operation::SendTo, Operation::Connect},
+ {Operation::Connect, Operation::Bind, Operation::SendTo},
+ {Operation::Connect, Operation::SendTo, Operation::Bind},
+ {Operation::SendTo, Operation::Bind, Operation::Connect},
+ {Operation::SendTo, Operation::Connect, Operation::Bind}})),
+ [](::testing::TestParamInfo<
+ std::tuple<TestAddress, OperationSequence>> const& info) {
+ const TestAddress& addr = std::get<0>(info.param);
+ const OperationSequence& operations = std::get<1>(info.param);
+ std::string s = addr.description;
+ for (const Operation& operation : operations) {
+ absl::StrAppend(&s, OperationToString(operation));
+ }
+ return s;
+ });
+
+void tcpSimpleConnectTest(TestAddress const& listener,
+ TestAddress const& connector, bool bound) {
// Create the listening socket.
const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
sockaddr_storage listen_addr = listener.addr;
- ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
- listener.addr_len),
- SyscallSucceeds());
+ if (bound) {
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ }
ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
// Get the port bound by the listening socket.
@@ -148,6 +302,23 @@ TEST_P(SocketInetLoopbackTest, TCP) {
ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds());
}
+TEST_P(SocketInetLoopbackTest, TCP) {
+ auto const& param = GetParam();
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ tcpSimpleConnectTest(listener, connector, /* bound */ true);
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenUnbound) {
+ auto const& param = GetParam();
+
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ tcpSimpleConnectTest(listener, connector, /* bound */ false);
+}
+
TEST_P(SocketInetLoopbackTest, TCPListenClose) {
auto const& param = GetParam();
@@ -206,7 +377,7 @@ TEST_P(SocketInetLoopbackTest, TCPListenClose) {
}
// TODO(b/138400178): Fix cooperative S/R failure when ds.reset() is invoked
// before function end.
- // ds.reset()
+ // ds.reset();
}
TEST_P(SocketInetLoopbackTest, TCPbacklog) {
@@ -362,7 +533,7 @@ TEST_P(SocketInetLoopbackTest, TCPFinWait2Test_NoRandomSave) {
// Sleep for a little over the linger timeout to reduce flakiness in
// save/restore tests.
- absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+ absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 2));
ds.reset();
@@ -603,6 +774,60 @@ TEST_P(SocketInetLoopbackTest, TCPTimeWaitTest_NoRandomSave) {
SyscallSucceeds());
}
+TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
+ auto const& param = GetParam();
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+
+ const uint16_t port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ // Set the userTimeout on the listening socket.
+ constexpr int kUserTimeout = 10;
+ ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &kUserTimeout, sizeof(kUserTimeout)),
+ SyscallSucceeds());
+
+ // Connect to the listening socket.
+ FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+
+ // Accept the connection.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+ // Verify that the accepted socket inherited the user timeout set on
+ // listening socket.
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(accepted.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kUserTimeout);
+}
+
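For context (not part of the patch): TCP_USER_TIMEOUT bounds, in milliseconds, how long transmitted data may remain unacknowledged before the connection is forcibly closed (tcp(7), RFC 5482). A minimal, hypothetical sketch of how a client might use it; the helper name and timeout value are assumptions.

    #include <netinet/tcp.h>
    #include <sys/socket.h>

    // Sets a 30-second user timeout, then connects; names and values are
    // illustrative only.
    int ConnectWithUserTimeout(int fd, const sockaddr* addr, socklen_t len) {
      constexpr int kTimeoutMillis = 30000;
      if (setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &kTimeoutMillis,
                     sizeof(kTimeoutMillis)) != 0) {
        return -1;
      }
      return connect(fd, addr, len);
    }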
INSTANTIATE_TEST_SUITE_P(
All, SocketInetLoopbackTest,
::testing::Values(
@@ -635,7 +860,9 @@ INSTANTIATE_TEST_SUITE_P(
using SocketInetReusePortTest = ::testing::TestWithParam<TestParam>;
-TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
+// TODO(gvisor.dev/issue/940): Remove _NoRandomSave when portHint/stack.Seed is
+// saved/restored.
+TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
auto const& param = GetParam();
TestAddress const& listener = param.listener;
@@ -643,6 +870,7 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
sockaddr_storage listen_addr = listener.addr;
sockaddr_storage conn_addr = connector.addr;
constexpr int kThreadCount = 3;
+ constexpr int kConnectAttempts = 10000;
// Create the listening socket.
FileDescriptor listener_fds[kThreadCount];
@@ -676,7 +904,6 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
}
- constexpr int kConnectAttempts = 10000;
std::atomic<int> connects_received = ATOMIC_VAR_INIT(0);
std::unique_ptr<ScopedThread> listen_thread[kThreadCount];
int accept_counts[kThreadCount] = {};
@@ -701,7 +928,14 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
}
// Receive some data from a socket to be sure that the connect()
// system call has been completed on another side.
- int data;
+ // Do a short read and then close the socket to trigger a RST. This
+ // ensures that both ends of the connection are cleaned up and no
+ // goroutines hang around in TIME-WAIT. We do this so that this test
+ // does not time out under gotsan runs, where lots of goroutines can
+ // cause the test to use absurd amounts of memory.
+ //
+ // See: https://tools.ietf.org/html/rfc2525#page-50 section 2.17
+ uint16_t data;
EXPECT_THAT(
RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
SyscallSucceedsWithValue(sizeof(data)));
@@ -724,8 +958,22 @@ TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread) {
connector.addr_len),
SyscallSucceeds());
+ // Do two separate sends to ensure two segments are received. This is
+ // required for netstack, where read incorrectly assumes that a whole
+ // segment has been consumed when endpoint.Read() is called, even though
+ // the syscall that invoked endpoint.Read() may have consumed it only
+ // partially. As a result, a close() of such a socket does not trigger
+ // a RST in netstack because the endpoint believes it has no unread
+ // data.
EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
SyscallSucceedsWithValue(sizeof(i)));
+
+ // TODO(gvisor.dev/issue/1449): Remove this block once netstack correctly
+ // generates a RST.
+ if (IsRunningOnGvisor()) {
+ EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+ }
}
});
@@ -920,41 +1168,44 @@ TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort) {
SyscallSucceedsWithValue(sizeof(i)));
}
- int epollfd;
- ASSERT_THAT(epollfd = epoll_create1(0), SyscallSucceeds());
-
+ struct pollfd pollfds[kThreadCount];
for (int i = 0; i < kThreadCount; i++) {
- int fd = listener_fds[i].get();
- struct epoll_event ev;
- ev.data.fd = fd;
- ev.events = EPOLLIN;
- ASSERT_THAT(epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev), SyscallSucceeds());
+ pollfds[i].fd = listener_fds[i].get();
+ pollfds[i].events = POLLIN;
}
std::map<uint16_t, int> portToFD;
- for (int i = 0; i < kConnectAttempts * 2; i++) {
- struct sockaddr_storage addr = {};
- socklen_t addrlen = sizeof(addr);
- struct epoll_event ev;
- int data, fd;
+ int received = 0;
+ while (received < kConnectAttempts * 2) {
+ ASSERT_THAT(poll(pollfds, kThreadCount, -1),
+ SyscallSucceedsWithValue(Gt(0)));
- ASSERT_THAT(epoll_wait(epollfd, &ev, 1, -1), SyscallSucceedsWithValue(1));
+ for (int i = 0; i < kThreadCount; i++) {
+ if ((pollfds[i].revents & POLLIN) == 0) {
+ continue;
+ }
- fd = ev.data.fd;
- EXPECT_THAT(RetryEINTR(recvfrom)(fd, &data, sizeof(data), 0,
- reinterpret_cast<struct sockaddr*>(&addr),
- &addrlen),
- SyscallSucceedsWithValue(sizeof(data)));
- uint16_t const port =
- ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr));
- auto prev_port = portToFD.find(port);
- // Check that all packets from one client have been delivered to the same
- // server socket.
- if (prev_port == portToFD.end()) {
- portToFD[port] = ev.data.fd;
- } else {
- EXPECT_EQ(portToFD[port], ev.data.fd);
+ received++;
+
+ const int fd = pollfds[i].fd;
+ struct sockaddr_storage addr = {};
+ socklen_t addrlen = sizeof(addr);
+ int data;
+ EXPECT_THAT(RetryEINTR(recvfrom)(
+ fd, &data, sizeof(data), 0,
+ reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+ SyscallSucceedsWithValue(sizeof(data)));
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr));
+ auto prev_port = portToFD.find(port);
+ // Check that all packets from one client have been delivered to the
+ // same server socket.
+ if (prev_port == portToFD.end()) {
+ portToFD[port] = fd;
+ } else {
+ EXPECT_EQ(portToFD[port], fd);
+ }
}
}
}
@@ -1841,7 +2092,7 @@ TEST_P(SocketMultiProtocolInetLoopbackTest, NoReusePortFollowingReusePort) {
}
INSTANTIATE_TEST_SUITE_P(
- AllFamlies, SocketMultiProtocolInetLoopbackTest,
+ AllFamilies, SocketMultiProtocolInetLoopbackTest,
::testing::Values(ProtocolTestParam{"TCP", SOCK_STREAM},
ProtocolTestParam{"UDP", SOCK_DGRAM}),
DescribeProtocolTestParam);
diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc
index e58eedaba..fda252dd7 100644
--- a/test/syscalls/linux/socket_ip_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc
@@ -23,6 +23,7 @@
namespace gvisor {
namespace testing {
+namespace {
std::vector<SocketPairKind> GetSocketPairs() {
return VecCat<SocketPairKind>(
@@ -43,5 +44,6 @@ INSTANTIATE_TEST_SUITE_P(
BlockingIPSockets, BlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+} // namespace
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index a37b49447..57ce8e169 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -24,6 +24,8 @@
#include <sys/un.h>
#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
#include "test/syscalls/linux/socket_test_util.h"
#include "test/util/test_util.h"
#include "test/util/thread_util.h"
@@ -789,5 +791,89 @@ TEST_P(TCPSocketPairTest, SetTCPLingerTimeout) {
EXPECT_EQ(get, kTCPLingerTimeout);
}
+TEST_P(TCPSocketPairTest, TestTCPCloseWithData) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ScopedThread t([&]() {
+ // Close one end to trigger sending of a FIN.
+ ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_WR), SyscallSucceeds());
+ char buf[3];
+ ASSERT_THAT(read(sockets->second_fd(), buf, 3),
+ SyscallSucceedsWithValue(3));
+ absl::SleepFor(absl::Milliseconds(50));
+ ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+ });
+
+ absl::SleepFor(absl::Milliseconds(50));
+ // Send some data then close.
+ constexpr char kStr[] = "abc";
+ ASSERT_THAT(write(sockets->first_fd(), kStr, 3), SyscallSucceedsWithValue(3));
+ t.Join();
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+}
+
+TEST_P(TCPSocketPairTest, TCPUserTimeoutDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 0); // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutZero) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kZero = 0;
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &kZero, sizeof(kZero)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 0); // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutBelowZero) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kNeg = -10;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &kNeg, sizeof(kNeg)),
+ SyscallFailsWithErrno(EINVAL));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 0); // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kAbove = 10;
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &kAbove, sizeof(kAbove)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kAbove);
+}
+
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback.cc b/test/syscalls/linux/socket_ip_tcp_loopback.cc
index 831de53b8..9db3037bc 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback.cc
@@ -21,6 +21,7 @@
namespace gvisor {
namespace testing {
+namespace {
std::vector<SocketPairKind> GetSocketPairs() {
return {
@@ -34,5 +35,6 @@ INSTANTIATE_TEST_SUITE_P(
AllUnixDomainSockets, AllSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+} // namespace
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
index 66eb68857..53290bed7 100644
--- a/test/syscalls/linux/socket_ip_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -209,6 +209,46 @@ TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
EXPECT_EQ(get, kSockOptOn);
}
+// Ensure that receiving TOS (IP_RECVTOS) is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS works as expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), IPPROTO_IP, IP_RECVTOS, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+}
+
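For context (not part of the patch): once IP_RECVTOS is enabled, the TOS byte of each received datagram is delivered as an IP_TOS control message alongside the payload. A rough sketch of consuming it, with hypothetical names and an assumed already-configured socket:

    #include <netinet/in.h>
    #include <string.h>
    #include <sys/socket.h>

    // Reads one datagram from `fd` (IP_RECVTOS already enabled) and extracts
    // the received TOS byte into *tos_out. Returns false if no TOS control
    // message arrived.
    bool ReadTosByte(int fd, unsigned char* tos_out) {
      char payload[64];
      char control[CMSG_SPACE(sizeof(unsigned char))];
      struct iovec iov = {payload, sizeof(payload)};
      struct msghdr msg = {};
      msg.msg_iov = &iov;
      msg.msg_iovlen = 1;
      msg.msg_control = control;
      msg.msg_controllen = sizeof(control);
      if (recvmsg(fd, &msg, 0) < 0) {
        return false;
      }
      for (struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg); cmsg != nullptr;
           cmsg = CMSG_NXTHDR(&msg, cmsg)) {
        if (cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_TOS) {
          memcpy(tos_out, CMSG_DATA(cmsg), sizeof(*tos_out));
          return true;
        }
      }
      return false;
    }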
TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
diff --git a/test/syscalls/linux/socket_ip_unbound.cc b/test/syscalls/linux/socket_ip_unbound.cc
index b6754111f..ca597e267 100644
--- a/test/syscalls/linux/socket_ip_unbound.cc
+++ b/test/syscalls/linux/socket_ip_unbound.cc
@@ -129,6 +129,7 @@ TEST_P(IPUnboundSocketTest, InvalidNegativeTtl) {
struct TOSOption {
int level;
int option;
+ int cmsg_level;
};
constexpr int INET_ECN_MASK = 3;
@@ -139,10 +140,12 @@ static TOSOption GetTOSOption(int domain) {
case AF_INET:
opt.level = IPPROTO_IP;
opt.option = IP_TOS;
+ opt.cmsg_level = SOL_IP;
break;
case AF_INET6:
opt.level = IPPROTO_IPV6;
opt.option = IPV6_TCLASS;
+ opt.cmsg_level = SOL_IPV6;
break;
}
return opt;
@@ -386,6 +389,36 @@ TEST_P(IPUnboundSocketTest, NullTOS) {
SyscallFailsWithErrno(EFAULT));
}
+TEST_P(IPUnboundSocketTest, InsufficientBufferTOS) {
+ SKIP_IF(GetParam().protocol == IPPROTO_TCP);
+
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ TOSOption t = GetTOSOption(GetParam().domain);
+
+ in_addr addr4;
+ in6_addr addr6;
+ ASSERT_THAT(inet_pton(AF_INET, "127.0.0.1", &addr4), ::testing::Eq(1));
+ ASSERT_THAT(inet_pton(AF_INET6, "fe80::", &addr6), ::testing::Eq(1));
+
+ cmsghdr cmsg = {};
+ cmsg.cmsg_len = sizeof(cmsg);
+ cmsg.cmsg_level = t.cmsg_level;
+ cmsg.cmsg_type = t.option;
+
+ msghdr msg = {};
+ msg.msg_control = &cmsg;
+ msg.msg_controllen = sizeof(cmsg);
+ if (GetParam().domain == AF_INET) {
+ msg.msg_name = &addr4;
+ msg.msg_namelen = sizeof(addr4);
+ } else {
+ msg.msg_name = &addr6;
+ msg.msg_namelen = sizeof(addr6);
+ }
+
+ EXPECT_THAT(sendmsg(socket->get(), &msg, 0), SyscallFailsWithErrno(EINVAL));
+}
+
INSTANTIATE_TEST_SUITE_P(
IPUnboundSockets, IPUnboundSocketTest,
::testing::ValuesIn(VecCat<SocketKind>(VecCat<SocketKind>(
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
index 6b1af6c17..aa6fb4e3f 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -1814,7 +1814,7 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReusePortThenReuseAddr) {
SyscallFailsWithErrno(EADDRINUSE));
}
-TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReusePort) {
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReusePort) {
auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
@@ -1855,7 +1855,7 @@ TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReusePort) {
SyscallFailsWithErrno(EADDRINUSE));
}
-TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertableToReuseAddr) {
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReuseAddr) {
// FIXME(b/129164367): Support SO_REUSEADDR on UDP sockets.
SKIP_IF(IsRunningOnGvisor());
diff --git a/test/syscalls/linux/socket_non_stream.cc b/test/syscalls/linux/socket_non_stream.cc
index d91c5ed39..c61817f14 100644
--- a/test/syscalls/linux/socket_non_stream.cc
+++ b/test/syscalls/linux/socket_non_stream.cc
@@ -113,7 +113,7 @@ TEST_P(NonStreamSocketPairTest, RecvmsgMsghdrFlagMsgTrunc) {
EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data)));
// Check that msghdr flags were updated.
- EXPECT_EQ(msg.msg_flags, MSG_TRUNC);
+ EXPECT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
}
// Stream sockets allow data sent with multiple sends to be peeked at in a
@@ -193,7 +193,7 @@ TEST_P(NonStreamSocketPairTest, MsgTruncTruncationRecvmsgMsghdrFlagMsgTrunc) {
EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data)));
// Check that msghdr flags were updated.
- EXPECT_EQ(msg.msg_flags, MSG_TRUNC);
+ EXPECT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
}
TEST_P(NonStreamSocketPairTest, MsgTruncSameSize) {
@@ -224,5 +224,114 @@ TEST_P(NonStreamSocketPairTest, MsgTruncNotFull) {
EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
}
+// This test verifies reading from a socket with MSG_TRUNC and a zero-length
+// receive buffer. The user should be able to get the message length.
+TEST_P(NonStreamSocketPairTest, RecvmsgMsgTruncZeroLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ // The receive buffer is of zero length.
+ char received_data[0] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ // The syscall succeeds returning the full size of the message on the socket.
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ // Check that MSG_TRUNC is set on msghdr flags.
+ EXPECT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+}
+
+// This test verifies reading from a socket with MSG_TRUNC | MSG_PEEK and a
+// zero-length receive buffer. The user should be able to get the message
+// length without reading data off the socket.
+TEST_P(NonStreamSocketPairTest, RecvmsgMsgTruncMsgPeekZeroLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ // The receive buffer is of zero length.
+ char peek_data[0] = {};
+
+ struct iovec peek_iov;
+ peek_iov.iov_base = peek_data;
+ peek_iov.iov_len = sizeof(peek_data);
+ struct msghdr peek_msg = {};
+ peek_msg.msg_flags = -1;
+ peek_msg.msg_iov = &peek_iov;
+ peek_msg.msg_iovlen = 1;
+
+ // The syscall succeeds returning the full size of the message on the socket.
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &peek_msg,
+ MSG_TRUNC | MSG_PEEK),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ // Check that MSG_TRUNC is set on msghdr flags because the receive buffer is
+ // smaller than the message size.
+ EXPECT_EQ(peek_msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+
+ char received_data[sizeof(sent_data)] = {};
+
+ struct iovec received_iov;
+ received_iov.iov_base = received_data;
+ received_iov.iov_len = sizeof(received_data);
+ struct msghdr received_msg = {};
+ received_msg.msg_flags = -1;
+ received_msg.msg_iov = &received_iov;
+ received_msg.msg_iovlen = 1;
+
+ // Next we can read the actual data.
+ ASSERT_THAT(
+ RetryEINTR(recvmsg)(sockets->second_fd(), &received_msg, MSG_TRUNC),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // Check that MSG_TRUNC is not set on msghdr flags because we read the whole
+ // message.
+ EXPECT_EQ(received_msg.msg_flags & MSG_TRUNC, 0);
+}
+
+// This test verifies reading from an empty socket with MSG_TRUNC | MSG_PEEK,
+// MSG_DONTWAIT, and a zero-length receive buffer. The user should get an
+// EAGAIN or EWOULDBLOCK error response.
+TEST_P(NonStreamSocketPairTest, RecvmsgTruncPeekDontwaitZeroLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // NOTE: We don't send any data on the socket.
+
+ // The receive buffer is of zero length.
+ char peek_data[0] = {};
+
+ struct iovec peek_iov;
+ peek_iov.iov_base = peek_data;
+ peek_iov.iov_len = sizeof(peek_data);
+ struct msghdr peek_msg = {};
+ peek_msg.msg_flags = -1;
+ peek_msg.msg_iov = &peek_iov;
+ peek_msg.msg_iovlen = 1;
+
+ // recvmsg fails with EAGAIN because no data is available on the socket.
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &peek_msg,
+ MSG_TRUNC | MSG_PEEK | MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
} // namespace testing
} // namespace gvisor
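As context for the zero-length-buffer tests above (a sketch, not part of the patch): on message-oriented sockets, MSG_TRUNC | MSG_PEEK with an empty receive buffer is the usual idiom for learning a pending message's size before allocating for it. The helper names below are hypothetical.

    #include <sys/socket.h>
    #include <vector>

    // Returns the size of the next pending message on `fd` without consuming
    // it, or -1 on error (including EAGAIN when nothing is queued).
    ssize_t PendingMessageSize(int fd) {
      struct msghdr msg = {};  // No iovecs: we only want the length.
      return recvmsg(fd, &msg, MSG_TRUNC | MSG_PEEK | MSG_DONTWAIT);
    }

    // Typical usage: size the buffer exactly, then read for real.
    bool ReadExactlySized(int fd, std::vector<char>* out) {
      ssize_t len = PendingMessageSize(fd);
      if (len < 0) return false;
      out->resize(len);
      struct iovec iov = {out->data(), static_cast<size_t>(len)};
      struct msghdr msg = {};
      msg.msg_iov = &iov;
      msg.msg_iovlen = 1;
      return recvmsg(fd, &msg, 0) == len;
    }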
diff --git a/test/syscalls/linux/socket_non_stream_blocking.cc b/test/syscalls/linux/socket_non_stream_blocking.cc
index 62d87c1af..b052f6e61 100644
--- a/test/syscalls/linux/socket_non_stream_blocking.cc
+++ b/test/syscalls/linux/socket_non_stream_blocking.cc
@@ -25,6 +25,7 @@
#include "test/syscalls/linux/socket_test_util.h"
#include "test/syscalls/linux/unix_domain_socket_test_util.h"
#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
namespace gvisor {
namespace testing {
@@ -44,5 +45,41 @@ TEST_P(BlockingNonStreamSocketPairTest, RecvLessThanBufferWaitAll) {
SyscallSucceedsWithValue(sizeof(sent_data)));
}
+// This test verifies reading from a socket with MSG_TRUNC | MSG_PEEK and a
+// zero-length receive buffer, without MSG_DONTWAIT. The recvmsg call should
+// block until data arrives on the socket.
+TEST_P(BlockingNonStreamSocketPairTest,
+ RecvmsgTruncPeekDontwaitZeroLenBlocking) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // NOTE: We don't initially send any data on the socket.
+ const int data_size = 10;
+ char sent_data[data_size];
+ RandomizeBuffer(sent_data, data_size);
+
+ // The receive buffer is of zero length.
+ char peek_data[0] = {};
+
+ struct iovec peek_iov;
+ peek_iov.iov_base = peek_data;
+ peek_iov.iov_len = sizeof(peek_data);
+ struct msghdr peek_msg = {};
+ peek_msg.msg_flags = -1;
+ peek_msg.msg_iov = &peek_iov;
+ peek_msg.msg_iovlen = 1;
+
+ ScopedThread t([&]() {
+ // This should block until there is data on the socket, and then succeed
+ // returning the full size of the pending message.
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &peek_msg,
+ MSG_TRUNC | MSG_PEEK),
+ SyscallSucceedsWithValue(data_size));
+ });
+
+ absl::SleepFor(absl::Seconds(1));
+ ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), sent_data, data_size, 0),
+ SyscallSucceedsWithValue(data_size));
+}
+
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/socket_stream.cc b/test/syscalls/linux/socket_stream.cc
index 346443f96..6522b2e01 100644
--- a/test/syscalls/linux/socket_stream.cc
+++ b/test/syscalls/linux/socket_stream.cc
@@ -104,7 +104,60 @@ TEST_P(StreamSocketPairTest, RecvmsgMsghdrFlagsNoMsgTrunc) {
EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data)));
// Check that msghdr flags were cleared (MSG_TRUNC was not set).
- EXPECT_EQ(msg.msg_flags, 0);
+ ASSERT_EQ(msg.msg_flags & MSG_TRUNC, 0);
+}
+
+TEST_P(StreamSocketPairTest, RecvmsgTruncZeroLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[0] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC),
+ SyscallSucceedsWithValue(0));
+
+ // Check that msghdr flags were cleared (MSG_TRUNC was not set).
+ ASSERT_EQ(msg.msg_flags & MSG_TRUNC, 0);
+}
+
+TEST_P(StreamSocketPairTest, RecvmsgTruncPeekZeroLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[0] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(
+ RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC | MSG_PEEK),
+ SyscallSucceedsWithValue(0));
+
+ // Check that msghdr flags were cleared (MSG_TRUNC was not set).
+ ASSERT_EQ(msg.msg_flags & MSG_TRUNC, 0);
}
TEST_P(StreamSocketPairTest, MsgTrunc) {
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
index 8a28202a8..4cf1f76f1 100644
--- a/test/syscalls/linux/socket_unix.cc
+++ b/test/syscalls/linux/socket_unix.cc
@@ -65,6 +65,21 @@ TEST_P(UnixSocketPairTest, BindToBadName) {
SyscallFailsWithErrno(ENOENT));
}
+TEST_P(UnixSocketPairTest, BindToBadFamily) {
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ constexpr char kBadName[] = "/some/path/that/does/not/exist";
+ sockaddr_un sockaddr;
+ sockaddr.sun_family = AF_INET;
+ memcpy(sockaddr.sun_path, kBadName, sizeof(kBadName));
+
+ EXPECT_THAT(
+ bind(pair->first_fd(), reinterpret_cast<struct sockaddr*>(&sockaddr),
+ sizeof(sockaddr)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
TEST_P(UnixSocketPairTest, RecvmmsgTimeoutAfterRecv) {
auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
char sent_data[10];
diff --git a/test/syscalls/linux/socket_unix_cmsg.cc b/test/syscalls/linux/socket_unix_cmsg.cc
index 1159c5229..a16899493 100644
--- a/test/syscalls/linux/socket_unix_cmsg.cc
+++ b/test/syscalls/linux/socket_unix_cmsg.cc
@@ -149,6 +149,35 @@ TEST_P(UnixSocketPairCmsgTest, BadFDPass) {
SyscallFailsWithErrno(EBADF));
}
+TEST_P(UnixSocketPairCmsgTest, ShortCmsg) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ int sent_fd = -1;
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(sent_fd))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_len = 1;
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), &sent_fd, sizeof(sent_fd));
+
+ struct iovec iov;
+ iov.iov_base = sent_data;
+ iov.iov_len = sizeof(sent_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
// BasicFDPassNoSpace starts off by sending a single FD just like BasicFDPass.
// The difference is that when calling recvmsg, no space for FDs is provided,
// only space for the cmsg header.
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index 99863b0ed..33a5ac66c 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -814,6 +814,20 @@ TEST_P(TcpSocketTest, FullBuffer) {
t_ = -1;
}
+TEST_P(TcpSocketTest, PollAfterShutdown) {
+ ScopedThread client_thread([this]() {
+ EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceedsWithValue(0));
+ struct pollfd poll_fd = {s_, POLLIN | POLLERR | POLLHUP, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+ SyscallSucceedsWithValue(1));
+ });
+
+ EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceedsWithValue(0));
+ struct pollfd poll_fd = {t_, POLLIN | POLLERR | POLLHUP, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+ SyscallSucceedsWithValue(1));
+}
+
TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) {
// Initialize address to the loopback one.
sockaddr_storage addr =
@@ -967,6 +981,78 @@ TEST_P(SimpleTcpSocketTest, BlockingConnectRefused) {
EXPECT_THAT(close(s.release()), SyscallSucceeds());
}
+// Test that connecting to a non-listening port and thus receiving a RST is
+// handled appropriately by the socket - the port that the socket was bound to
+// is released and the expected error is returned.
+TEST_P(SimpleTcpSocketTest, CleanupOnConnectionRefused) {
+ // Create a socket that is known to not be listening. As is it bound but not
+ // listening, when another socket connects to the port, it will refuse..
+ FileDescriptor bound_s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ sockaddr_storage bound_addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t bound_addrlen = sizeof(bound_addr);
+
+ ASSERT_THAT(
+ bind(bound_s.get(), reinterpret_cast<struct sockaddr*>(&bound_addr),
+ bound_addrlen),
+ SyscallSucceeds());
+
+ // Get the addresses the socket is bound to because the port is chosen by the
+ // stack.
+ ASSERT_THAT(getsockname(bound_s.get(),
+ reinterpret_cast<struct sockaddr*>(&bound_addr),
+ &bound_addrlen),
+ SyscallSucceeds());
+
+ // Create, initialize, and bind the socket that is used to test connecting to
+ // the non-listening port.
+ FileDescriptor client_s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ // Initialize client address to the loopback one.
+ sockaddr_storage client_addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t client_addrlen = sizeof(client_addr);
+
+ ASSERT_THAT(
+ bind(client_s.get(), reinterpret_cast<struct sockaddr*>(&client_addr),
+ client_addrlen),
+ SyscallSucceeds());
+
+ ASSERT_THAT(getsockname(client_s.get(),
+ reinterpret_cast<struct sockaddr*>(&client_addr),
+ &client_addrlen),
+ SyscallSucceeds());
+
+ // Now the test: connect to the bound but not listening socket with the
+ // client socket. The bound socket should return a RST and cause the client
+ // socket to return an error and clean itself up immediately.
+  // Returning ECONNREFUSED diverges from RFC 793, page 37, but matches what
+  // Linux does.
+ ASSERT_THAT(connect(client_s.get(),
+ reinterpret_cast<const struct sockaddr*>(&bound_addr),
+ bound_addrlen),
+ SyscallFailsWithErrno(ECONNREFUSED));
+
+ FileDescriptor new_s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+  // Test binding to the address from the client socket. This should succeed
+  // if the earlier binding was dropped correctly.
+ ASSERT_THAT(
+ bind(new_s.get(), reinterpret_cast<struct sockaddr*>(&client_addr),
+ client_addrlen),
+ SyscallSucceeds());
+
+  // Attempt #2: with the new socket and the reused addr, connect should fail
+  // in the same way as before, not with EADDRINUSE.
+ ASSERT_THAT(connect(client_s.get(),
+ reinterpret_cast<const struct sockaddr*>(&bound_addr),
+ bound_addrlen),
+ SyscallFailsWithErrno(ECONNREFUSED));
+}
+
// Test that we get an ECONNREFUSED with a nonblocking socket.
TEST_P(SimpleTcpSocketTest, NonBlockingConnectRefused) {
FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
@@ -1175,6 +1261,31 @@ TEST_P(SimpleTcpSocketTest, SetMaxSegFailsForInvalidMSSValues) {
}
}
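+// SetTCPUserTimeout checks that TCP_USER_TIMEOUT rejects a negative value and
+// that a valid timeout can be set and read back unchanged.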
+TEST_P(SimpleTcpSocketTest, SetTCPUserTimeout) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ {
+ constexpr int kTCPUserTimeout = -1;
+ EXPECT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &kTCPUserTimeout, sizeof(kTCPUserTimeout)),
+ SyscallFailsWithErrno(EINVAL));
+ }
+
+ // kTCPUserTimeout is in milliseconds.
+ constexpr int kTCPUserTimeout = 100;
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &kTCPUserTimeout, sizeof(kTCPUserTimeout)),
+ SyscallSucceedsWithValue(0));
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kTCPUserTimeout);
+}
+
INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
::testing::Values(AF_INET, AF_INET6));
diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc
index 111dbacdf..7a8ac30a4 100644
--- a/test/syscalls/linux/udp_socket.cc
+++ b/test/syscalls/linux/udp_socket.cc
@@ -12,1332 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <arpa/inet.h>
-#include <fcntl.h>
-#include <linux/errqueue.h>
-#include <netinet/in.h>
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-
-#include "gtest/gtest.h"
-#include "absl/base/macros.h"
-#include "absl/time/clock.h"
-#include "absl/time/time.h"
-#include "test/syscalls/linux/socket_test_util.h"
-#include "test/syscalls/linux/unix_domain_socket_test_util.h"
-#include "test/util/test_util.h"
-#include "test/util/thread_util.h"
+#include "test/syscalls/linux/udp_socket_test_cases.h"
namespace gvisor {
namespace testing {
namespace {
-// The initial port to be be used on gvisor.
-constexpr int TestPort = 40000;
-
-// Fixture for tests parameterized by the address family to use (AF_INET and
-// AF_INET6) when creating sockets.
-class UdpSocketTest : public ::testing::TestWithParam<AddressFamily> {
- protected:
- // Creates two sockets that will be used by test cases.
- void SetUp() override;
-
- // Closes the sockets created by SetUp().
- void TearDown() override {
- EXPECT_THAT(close(s_), SyscallSucceeds());
- EXPECT_THAT(close(t_), SyscallSucceeds());
-
- for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
- ASSERT_NO_ERRNO(FreeAvailablePort(ports_[i]));
- }
- }
-
- // First UDP socket.
- int s_;
-
- // Second UDP socket.
- int t_;
-
- // The length of the socket address.
- socklen_t addrlen_;
-
- // Initialized address pointing to loopback and port TestPort+i.
- struct sockaddr* addr_[3];
-
- // Initialize "any" address.
- struct sockaddr* anyaddr_;
-
- // Used ports.
- int ports_[3];
-
- private:
- // Storage for the loopback addresses.
- struct sockaddr_storage addr_storage_[3];
-
- // Storage for the "any" address.
- struct sockaddr_storage anyaddr_storage_;
-};
-
-// Gets a pointer to the port component of the given address.
-uint16_t* Port(struct sockaddr_storage* addr) {
- switch (addr->ss_family) {
- case AF_INET: {
- auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
- return &sin->sin_port;
- }
- case AF_INET6: {
- auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr);
- return &sin6->sin6_port;
- }
- }
-
- return nullptr;
-}
-
-void UdpSocketTest::SetUp() {
- int type;
- if (GetParam() == AddressFamily::kIpv4) {
- type = AF_INET;
- auto sin = reinterpret_cast<struct sockaddr_in*>(&anyaddr_storage_);
- addrlen_ = sizeof(*sin);
- sin->sin_addr.s_addr = htonl(INADDR_ANY);
- } else {
- type = AF_INET6;
- auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&anyaddr_storage_);
- addrlen_ = sizeof(*sin6);
- if (GetParam() == AddressFamily::kIpv6) {
- sin6->sin6_addr = IN6ADDR_ANY_INIT;
- } else {
- TestAddress const& v4_mapped_any = V4MappedAny();
- sin6->sin6_addr =
- reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
- ->sin6_addr;
- }
- }
- ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
-
- ASSERT_THAT(t_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
-
- memset(&anyaddr_storage_, 0, sizeof(anyaddr_storage_));
- anyaddr_ = reinterpret_cast<struct sockaddr*>(&anyaddr_storage_);
- anyaddr_->sa_family = type;
-
- if (gvisor::testing::IsRunningOnGvisor()) {
- for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
- ports_[i] = TestPort + i;
- }
- } else {
- // When not under gvisor, use utility function to pick port. Assert that
- // all ports are different.
- std::string error;
- for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
- // Find an unused port, we specify port 0 to allow the kernel to provide
- // the port.
- bool unique = true;
- do {
- ports_[i] = ASSERT_NO_ERRNO_AND_VALUE(PortAvailable(
- 0, AddressFamily::kDualStack, SocketType::kUdp, false));
- ASSERT_GT(ports_[i], 0);
- for (size_t j = 0; j < i; ++j) {
- if (ports_[j] == ports_[i]) {
- unique = false;
- break;
- }
- }
- } while (!unique);
- }
- }
-
- // Initialize the sockaddrs.
- for (size_t i = 0; i < ABSL_ARRAYSIZE(addr_); ++i) {
- memset(&addr_storage_[i], 0, sizeof(addr_storage_[i]));
-
- addr_[i] = reinterpret_cast<struct sockaddr*>(&addr_storage_[i]);
- addr_[i]->sa_family = type;
-
- switch (type) {
- case AF_INET: {
- auto sin = reinterpret_cast<struct sockaddr_in*>(addr_[i]);
- sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
- sin->sin_port = htons(ports_[i]);
- break;
- }
- case AF_INET6: {
- auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr_[i]);
- sin6->sin6_addr = in6addr_loopback;
- sin6->sin6_port = htons(ports_[i]);
- break;
- }
- }
- }
-}
-
-TEST_P(UdpSocketTest, Creation) {
- int type = AF_INET6;
- if (GetParam() == AddressFamily::kIpv4) {
- type = AF_INET;
- }
-
- int s_;
-
- ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
- EXPECT_THAT(close(s_), SyscallSucceeds());
-
- ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, 0), SyscallSucceeds());
- EXPECT_THAT(close(s_), SyscallSucceeds());
-
- ASSERT_THAT(s_ = socket(type, SOCK_STREAM, IPPROTO_UDP), SyscallFails());
-}
-
-TEST_P(UdpSocketTest, Getsockname) {
- // Check that we're not bound.
- struct sockaddr_storage addr;
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
- EXPECT_EQ(addrlen, addrlen_);
- EXPECT_EQ(memcmp(&addr, anyaddr_, addrlen_), 0);
-
- // Bind, then check that we get the right address.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- addrlen = sizeof(addr);
- EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
- EXPECT_EQ(addrlen, addrlen_);
- EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, Getpeername) {
- // Check that we're not connected.
- struct sockaddr_storage addr;
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallFailsWithErrno(ENOTCONN));
-
- // Connect, then check that we get the right address.
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- addrlen = sizeof(addr);
- EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
- EXPECT_EQ(addrlen, addrlen_);
- EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, SendNotConnected) {
- // Do send & write, they must fail.
- char buf[512];
- EXPECT_THAT(send(s_, buf, sizeof(buf), 0),
- SyscallFailsWithErrno(EDESTADDRREQ));
-
- EXPECT_THAT(write(s_, buf, sizeof(buf)), SyscallFailsWithErrno(EDESTADDRREQ));
-
- // Use sendto.
- ASSERT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[0], addrlen_),
- SyscallSucceedsWithValue(sizeof(buf)));
-
- // Check that we're bound now.
- struct sockaddr_storage addr;
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
- EXPECT_EQ(addrlen, addrlen_);
- EXPECT_NE(*Port(&addr), 0);
-}
-
-TEST_P(UdpSocketTest, ConnectBinds) {
- // Connect the socket.
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Check that we're bound now.
- struct sockaddr_storage addr;
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
- EXPECT_EQ(addrlen, addrlen_);
- EXPECT_NE(*Port(&addr), 0);
-}
-
-TEST_P(UdpSocketTest, ReceiveNotBound) {
- char buf[512];
- EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, Bind) {
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Try to bind again.
- EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL));
-
- // Check that we're still bound to the original address.
- struct sockaddr_storage addr;
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
- EXPECT_EQ(addrlen, addrlen_);
- EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, BindInUse) {
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Try to bind again.
- EXPECT_THAT(bind(t_, addr_[0], addrlen_), SyscallFailsWithErrno(EADDRINUSE));
-}
-
-TEST_P(UdpSocketTest, ReceiveAfterConnect) {
- // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort.
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
- ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Get the address s_ was bound to during connect.
- struct sockaddr_storage addr;
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
- EXPECT_EQ(addrlen, addrlen_);
-
- // Send from t_ to s_.
- char buf[512];
- RandomizeBuffer(buf, sizeof(buf));
- ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0,
- reinterpret_cast<sockaddr*>(&addr), addrlen),
- SyscallSucceedsWithValue(sizeof(buf)));
-
- // Receive the data.
- char received[sizeof(buf)];
- EXPECT_THAT(recv(s_, received, sizeof(received), 0),
- SyscallSucceedsWithValue(sizeof(received)));
- EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, ReceiveAfterDisconnect) {
- // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort.
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
- ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds());
-
- // Get the address s_ was bound to during connect.
- struct sockaddr_storage addr;
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
- EXPECT_EQ(addrlen, addrlen_);
-
- for (int i = 0; i < 2; i++) {
- // Send from t_ to s_.
- char buf[512];
- RandomizeBuffer(buf, sizeof(buf));
- EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
- ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0,
- reinterpret_cast<sockaddr*>(&addr), addrlen),
- SyscallSucceedsWithValue(sizeof(buf)));
-
- // Receive the data.
- char received[sizeof(buf)];
- EXPECT_THAT(recv(s_, received, sizeof(received), 0),
- SyscallSucceedsWithValue(sizeof(received)));
- EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-
- // Disconnect s_.
- struct sockaddr addr = {};
- addr.sa_family = AF_UNSPEC;
- ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)), SyscallSucceeds());
- // Connect s_ loopback:TestPort.
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
- }
-}
-
-TEST_P(UdpSocketTest, Connect) {
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Check that we're connected to the right peer.
- struct sockaddr_storage peer;
- socklen_t peerlen = sizeof(peer);
- EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
- SyscallSucceeds());
- EXPECT_EQ(peerlen, addrlen_);
- EXPECT_EQ(memcmp(&peer, addr_[0], addrlen_), 0);
-
- // Try to bind after connect.
- EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL));
-
- // Try to connect again.
- EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
-
- // Check that peer name changed.
- peerlen = sizeof(peer);
- EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
- SyscallSucceeds());
- EXPECT_EQ(peerlen, addrlen_);
- EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
-}
-
-void ConnectAny(AddressFamily family, int sockfd, uint16_t port) {
- struct sockaddr_storage addr = {};
-
- // Precondition check.
- {
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(
- getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
-
- if (family == AddressFamily::kIpv4) {
- auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
- EXPECT_EQ(addrlen, sizeof(*addr_out));
- EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
- } else {
- auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
- EXPECT_EQ(addrlen, sizeof(*addr_out));
- struct in6_addr any = IN6ADDR_ANY_INIT;
- EXPECT_EQ(memcmp(&addr_out->sin6_addr, &any, sizeof(in6_addr)), 0);
- }
-
- {
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(
- getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallFailsWithErrno(ENOTCONN));
- }
-
- struct sockaddr_storage baddr = {};
- if (family == AddressFamily::kIpv4) {
- auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
- addrlen = sizeof(*addr_in);
- addr_in->sin_family = AF_INET;
- addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
- addr_in->sin_port = port;
- } else {
- auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
- addrlen = sizeof(*addr_in);
- addr_in->sin6_family = AF_INET6;
- addr_in->sin6_port = port;
- if (family == AddressFamily::kIpv6) {
- addr_in->sin6_addr = IN6ADDR_ANY_INIT;
- } else {
- TestAddress const& v4_mapped_any = V4MappedAny();
- addr_in->sin6_addr =
- reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
- ->sin6_addr;
- }
- }
-
- // TODO(b/138658473): gVisor doesn't allow connecting to the zero port.
- if (port == 0) {
- SKIP_IF(IsRunningOnGvisor());
- }
-
- ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen),
- SyscallSucceeds());
- }
-
- // Postcondition check.
- {
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(
- getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
-
- if (family == AddressFamily::kIpv4) {
- auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
- EXPECT_EQ(addrlen, sizeof(*addr_out));
- EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK));
- } else {
- auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
- EXPECT_EQ(addrlen, sizeof(*addr_out));
- struct in6_addr loopback;
- if (family == AddressFamily::kIpv6) {
- loopback = IN6ADDR_LOOPBACK_INIT;
- } else {
- TestAddress const& v4_mapped_loopback = V4MappedLoopback();
- loopback = reinterpret_cast<const struct sockaddr_in6*>(
- &v4_mapped_loopback.addr)
- ->sin6_addr;
- }
-
- EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
- }
-
- addrlen = sizeof(addr);
- if (port == 0) {
- EXPECT_THAT(
- getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallFailsWithErrno(ENOTCONN));
- } else {
- EXPECT_THAT(
- getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
- }
- }
-}
-
-TEST_P(UdpSocketTest, ConnectAny) { ConnectAny(GetParam(), s_, 0); }
-
-TEST_P(UdpSocketTest, ConnectAnyWithPort) {
- auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
- ConnectAny(GetParam(), s_, port);
-}
-
-void DisconnectAfterConnectAny(AddressFamily family, int sockfd, int port) {
- struct sockaddr_storage addr = {};
-
- socklen_t addrlen = sizeof(addr);
- struct sockaddr_storage baddr = {};
- if (family == AddressFamily::kIpv4) {
- auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
- addrlen = sizeof(*addr_in);
- addr_in->sin_family = AF_INET;
- addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
- addr_in->sin_port = port;
- } else {
- auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
- addrlen = sizeof(*addr_in);
- addr_in->sin6_family = AF_INET6;
- addr_in->sin6_port = port;
- if (family == AddressFamily::kIpv6) {
- addr_in->sin6_addr = IN6ADDR_ANY_INIT;
- } else {
- TestAddress const& v4_mapped_any = V4MappedAny();
- addr_in->sin6_addr =
- reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
- ->sin6_addr;
- }
- }
-
- // TODO(b/138658473): gVisor doesn't allow connecting to the zero port.
- if (port == 0) {
- SKIP_IF(IsRunningOnGvisor());
- }
-
- ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen),
- SyscallSucceeds());
- // Now the socket is bound to the loopback address.
-
- // Disconnect
- addrlen = sizeof(addr);
- addr.ss_family = AF_UNSPEC;
- ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&addr), addrlen),
- SyscallSucceeds());
-
- // Check that after disconnect the socket is bound to the ANY address.
- EXPECT_THAT(getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
- if (family == AddressFamily::kIpv4) {
- auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
- EXPECT_EQ(addrlen, sizeof(*addr_out));
- EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
- } else {
- auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
- EXPECT_EQ(addrlen, sizeof(*addr_out));
- struct in6_addr loopback = IN6ADDR_ANY_INIT;
-
- EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
- }
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterConnectAny) {
- DisconnectAfterConnectAny(GetParam(), s_, 0);
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterConnectAnyWithPort) {
- auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
- DisconnectAfterConnectAny(GetParam(), s_, port);
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterBind) {
- ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
- // Connect the socket.
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- struct sockaddr_storage addr = {};
- addr.ss_family = AF_UNSPEC;
- EXPECT_THAT(
- connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
- SyscallSucceeds());
-
- // Check that we're still bound.
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
-
- EXPECT_EQ(addrlen, addrlen_);
- EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0);
-
- addrlen = sizeof(addr);
- EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, DisconnectAfterBindToAny) {
- struct sockaddr_storage baddr = {};
- socklen_t addrlen;
- auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
- if (GetParam() == AddressFamily::kIpv4) {
- auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
- addr_in->sin_family = AF_INET;
- addr_in->sin_port = port;
- addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
- } else {
- auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
- addr_in->sin6_family = AF_INET6;
- addr_in->sin6_port = port;
- addr_in->sin6_scope_id = 0;
- addr_in->sin6_addr = IN6ADDR_ANY_INIT;
- }
- ASSERT_THAT(bind(s_, reinterpret_cast<sockaddr*>(&baddr), addrlen_),
- SyscallSucceeds());
- // Connect the socket.
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- struct sockaddr_storage addr = {};
- addr.ss_family = AF_UNSPEC;
- EXPECT_THAT(
- connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
- SyscallSucceeds());
-
- // Check that we're still bound.
- addrlen = sizeof(addr);
- EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
-
- EXPECT_EQ(addrlen, addrlen_);
- EXPECT_EQ(memcmp(&addr, &baddr, addrlen), 0);
-
- addrlen = sizeof(addr);
- EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, Disconnect) {
- for (int i = 0; i < 2; i++) {
- // Try to connect again.
- EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
-
- // Check that we're connected to the right peer.
- struct sockaddr_storage peer;
- socklen_t peerlen = sizeof(peer);
- EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
- SyscallSucceeds());
- EXPECT_EQ(peerlen, addrlen_);
- EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
-
- // Try to disconnect.
- struct sockaddr_storage addr = {};
- addr.ss_family = AF_UNSPEC;
- EXPECT_THAT(
- connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
- SyscallSucceeds());
-
- peerlen = sizeof(peer);
- EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
- SyscallFailsWithErrno(ENOTCONN));
-
- // Check that we're still bound.
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceeds());
- EXPECT_EQ(addrlen, addrlen_);
- EXPECT_EQ(*Port(&addr), 0);
- }
-}
-
-TEST_P(UdpSocketTest, ConnectBadAddress) {
- struct sockaddr addr = {};
- addr.sa_family = addr_[0]->sa_family;
- ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)),
- SyscallFailsWithErrno(EINVAL));
-}
-
-TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) {
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Send to a different destination than we're connected to.
- char buf[512];
- EXPECT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[1], addrlen_),
- SyscallSucceedsWithValue(sizeof(buf)));
-}
-
-TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
- // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
- // Bind t_ to loopback:TestPort+1.
- ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
-
- char buf[3];
- // Send zero length packet from s_ to t_.
- ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
- // Receive the packet.
- char received[3];
- EXPECT_THAT(read(t_, received, sizeof(received)),
- SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
- // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
- // Bind t_ to loopback:TestPort+1.
- ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
-
- // Set t_ to non-blocking.
- int opts = 0;
- ASSERT_THAT(opts = fcntl(t_, F_GETFL), SyscallSucceeds());
- ASSERT_THAT(fcntl(t_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds());
-
- char buf[3];
- // Send zero length packet from s_ to t_.
- ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
- // Receive the packet.
- char received[3];
- EXPECT_THAT(read(t_, received, sizeof(received)),
- SyscallSucceedsWithValue(0));
- EXPECT_THAT(read(t_, received, sizeof(received)),
- SyscallFailsWithErrno(EAGAIN));
-}
-
-TEST_P(UdpSocketTest, SendAndReceiveNotConnected) {
- // Bind s_ to loopback.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Send some data to s_.
- char buf[512];
- RandomizeBuffer(buf, sizeof(buf));
-
- ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
- SyscallSucceedsWithValue(sizeof(buf)));
-
- // Receive the data.
- char received[sizeof(buf)];
- EXPECT_THAT(recv(s_, received, sizeof(received), 0),
- SyscallSucceedsWithValue(sizeof(received)));
- EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, SendAndReceiveConnected) {
- // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
- // Bind t_ to loopback:TestPort+1.
- ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
-
- // Send some data from t_ to s_.
- char buf[512];
- RandomizeBuffer(buf, sizeof(buf));
-
- ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
- SyscallSucceedsWithValue(sizeof(buf)));
-
- // Receive the data.
- char received[sizeof(buf)];
- EXPECT_THAT(recv(s_, received, sizeof(received), 0),
- SyscallSucceedsWithValue(sizeof(received)));
- EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, ReceiveFromNotConnected) {
- // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
- // Bind t_ to loopback:TestPort+2.
- ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
-
- // Send some data from t_ to s_.
- char buf[512];
- ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
- SyscallSucceedsWithValue(sizeof(buf)));
-
- // Check that the data isn't_ received because it was sent from a different
- // address than we're connected.
- EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, ReceiveBeforeConnect) {
- // Bind s_ to loopback:TestPort.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Bind t_ to loopback:TestPort+2.
- ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
-
- // Send some data from t_ to s_.
- char buf[512];
- RandomizeBuffer(buf, sizeof(buf));
-
- ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
- SyscallSucceedsWithValue(sizeof(buf)));
-
- // Connect to loopback:TestPort+1.
- ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
- // Receive the data. It works because it was sent before the connect.
- char received[sizeof(buf)];
- EXPECT_THAT(recv(s_, received, sizeof(received), 0),
- SyscallSucceedsWithValue(sizeof(received)));
- EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-
- // Send again. This time it should not be received.
- ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
- SyscallSucceedsWithValue(sizeof(buf)));
-
- EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, ReceiveFrom) {
- // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
-
- // Bind t_ to loopback:TestPort+1.
- ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
-
- // Send some data from t_ to s_.
- char buf[512];
- RandomizeBuffer(buf, sizeof(buf));
-
- ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
- SyscallSucceedsWithValue(sizeof(buf)));
-
- // Receive the data and sender address.
- char received[sizeof(buf)];
- struct sockaddr_storage addr;
- socklen_t addrlen = sizeof(addr);
- EXPECT_THAT(recvfrom(s_, received, sizeof(received), 0,
- reinterpret_cast<sockaddr*>(&addr), &addrlen),
- SyscallSucceedsWithValue(sizeof(received)));
- EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
- EXPECT_EQ(addrlen, addrlen_);
- EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0);
-}
-
-TEST_P(UdpSocketTest, Listen) {
- ASSERT_THAT(listen(s_, SOMAXCONN), SyscallFailsWithErrno(EOPNOTSUPP));
-}
-
-TEST_P(UdpSocketTest, Accept) {
- ASSERT_THAT(accept(s_, nullptr, nullptr), SyscallFailsWithErrno(EOPNOTSUPP));
-}
-
-// This test validates that a read shutdown with pending data allows the read
-// to proceed with the data before returning EAGAIN.
-TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) {
- char received[512];
-
- // Bind t_ to loopback:TestPort+2.
- ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds());
-
- // Connect the socket, then try to shutdown again.
- ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
-
- // Verify that we get EWOULDBLOCK when there is nothing to read.
- EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-
- const char* buf = "abc";
- EXPECT_THAT(write(t_, buf, 3), SyscallSucceedsWithValue(3));
-
- int opts = 0;
- ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
- ASSERT_THAT(fcntl(s_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds());
- ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
- ASSERT_NE(opts & O_NONBLOCK, 0);
-
- EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
- // We should get the data even though read has been shutdown.
- EXPECT_THAT(recv(s_, received, 2, 0), SyscallSucceedsWithValue(2));
-
- // Because we read less than the entire packet length, since it's a packet
- // based socket any subsequent reads should return EWOULDBLOCK.
- EXPECT_THAT(recv(s_, received, 1, 0), SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-// This test is validating that even after a socket is shutdown if it's
-// reconnected it will reset the shutdown state.
-TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) {
- char received[512];
- EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-
- EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
-
- EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-
- // Connect the socket, then try to shutdown again.
- ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
-
- EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-}
-
-TEST_P(UdpSocketTest, ReadShutdown) {
- char received[512];
- EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-
- EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
-
- EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-
- // Connect the socket, then try to shutdown again.
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-
- EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
- EXPECT_THAT(recv(s_, received, sizeof(received), 0),
- SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UdpSocketTest, ReadShutdownDifferentThread) {
- char received[512];
- EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-
- // Connect the socket, then shutdown from another thread.
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-
- ScopedThread t([&] {
- absl::SleepFor(absl::Milliseconds(200));
- EXPECT_THAT(shutdown(this->s_, SHUT_RD), SyscallSucceeds());
- });
- EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
- SyscallSucceedsWithValue(0));
- t.Join();
-
- EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
- SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UdpSocketTest, WriteShutdown) {
- EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
- EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceeds());
-}
-
-TEST_P(UdpSocketTest, SynchronousReceive) {
- // Bind s_ to loopback.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Send some data to s_ from another thread.
- char buf[512];
- RandomizeBuffer(buf, sizeof(buf));
-
- // Receive the data prior to actually starting the other thread.
- char received[512];
- EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), MSG_DONTWAIT),
- SyscallFailsWithErrno(EWOULDBLOCK));
-
- // Start the thread.
- ScopedThread t([&] {
- absl::SleepFor(absl::Milliseconds(200));
- ASSERT_THAT(
- sendto(this->t_, buf, sizeof(buf), 0, this->addr_[0], this->addrlen_),
- SyscallSucceedsWithValue(sizeof(buf)));
- });
-
- EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
- SyscallSucceedsWithValue(512));
- EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
-}
-
-TEST_P(UdpSocketTest, BoundaryPreserved_SendRecv) {
- // Bind s_ to loopback:TestPort.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Send 3 packets from t_ to s_.
- constexpr int psize = 100;
- char buf[3 * psize];
- RandomizeBuffer(buf, sizeof(buf));
-
- for (int i = 0; i < 3; ++i) {
- ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_),
- SyscallSucceedsWithValue(psize));
- }
-
- // Receive the data as 3 separate packets.
- char received[6 * psize];
- for (int i = 0; i < 3; ++i) {
- EXPECT_THAT(recv(s_, received + i * psize, 3 * psize, 0),
- SyscallSucceedsWithValue(psize));
- }
- EXPECT_EQ(memcmp(buf, received, 3 * psize), 0);
-}
-
-TEST_P(UdpSocketTest, BoundaryPreserved_WritevReadv) {
- // Bind s_ to loopback:TestPort.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Direct writes from t_ to s_.
- ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Send 2 packets from t_ to s_, where each packet's data consists of 2
- // discontiguous iovecs.
- constexpr size_t kPieceSize = 100;
- char buf[4 * kPieceSize];
- RandomizeBuffer(buf, sizeof(buf));
-
- for (int i = 0; i < 2; i++) {
- struct iovec iov[2];
- for (int j = 0; j < 2; j++) {
- iov[j].iov_base = reinterpret_cast<void*>(
- reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
- iov[j].iov_len = kPieceSize;
- }
- ASSERT_THAT(writev(t_, iov, 2), SyscallSucceedsWithValue(2 * kPieceSize));
- }
-
- // Receive the data as 2 separate packets.
- char received[6 * kPieceSize];
- for (int i = 0; i < 2; i++) {
- struct iovec iov[3];
- for (int j = 0; j < 3; j++) {
- iov[j].iov_base = reinterpret_cast<void*>(
- reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
- iov[j].iov_len = kPieceSize;
- }
- ASSERT_THAT(readv(s_, iov, 3), SyscallSucceedsWithValue(2 * kPieceSize));
- }
- EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
-}
-
-TEST_P(UdpSocketTest, BoundaryPreserved_SendMsgRecvMsg) {
- // Bind s_ to loopback:TestPort.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Send 2 packets from t_ to s_, where each packet's data consists of 2
- // discontiguous iovecs.
- constexpr size_t kPieceSize = 100;
- char buf[4 * kPieceSize];
- RandomizeBuffer(buf, sizeof(buf));
-
- for (int i = 0; i < 2; i++) {
- struct iovec iov[2];
- for (int j = 0; j < 2; j++) {
- iov[j].iov_base = reinterpret_cast<void*>(
- reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
- iov[j].iov_len = kPieceSize;
- }
- struct msghdr msg = {};
- msg.msg_name = addr_[0];
- msg.msg_namelen = addrlen_;
- msg.msg_iov = iov;
- msg.msg_iovlen = 2;
- ASSERT_THAT(sendmsg(t_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize));
- }
-
- // Receive the data as 2 separate packets.
- char received[6 * kPieceSize];
- for (int i = 0; i < 2; i++) {
- struct iovec iov[3];
- for (int j = 0; j < 3; j++) {
- iov[j].iov_base = reinterpret_cast<void*>(
- reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
- iov[j].iov_len = kPieceSize;
- }
- struct msghdr msg = {};
- msg.msg_iov = iov;
- msg.msg_iovlen = 3;
- ASSERT_THAT(recvmsg(s_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize));
- }
- EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
-}
-
-TEST_P(UdpSocketTest, FIONREADShutdown) {
- int n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, 0);
-
- // A UDP socket must be connected before it can be shutdown.
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, 0);
-
- EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
- n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, 0);
-}
-
-TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
- int n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, 0);
-
- // Bind s_ to loopback:TestPort.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // A UDP socket must be connected before it can be shutdown.
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, 0);
-
- const char str[] = "abc";
- ASSERT_THAT(send(s_, str, sizeof(str), 0),
- SyscallSucceedsWithValue(sizeof(str)));
-
- n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, sizeof(str));
-
- EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
- n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, sizeof(str));
-}
-
-TEST_P(UdpSocketTest, FIONREAD) {
- // Bind s_ to loopback:TestPort.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Check that the bound socket with an empty buffer reports an empty first
- // packet.
- int n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, 0);
-
- // Send 3 packets from t_ to s_.
- constexpr int psize = 100;
- char buf[3 * psize];
- RandomizeBuffer(buf, sizeof(buf));
-
- for (int i = 0; i < 3; ++i) {
- ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_),
- SyscallSucceedsWithValue(psize));
-
- // Check that regardless of how many packets are in the queue, the size
- // reported is that of a single packet.
- n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, psize);
- }
-}
-
-TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) {
- // Bind s_ to loopback:TestPort.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // Check that the bound socket with an empty buffer reports an empty first
- // packet.
- int n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, 0);
-
- // Send 3 packets from t_ to s_.
- constexpr int psize = 100;
- char buf[3 * psize];
- RandomizeBuffer(buf, sizeof(buf));
-
- for (int i = 0; i < 3; ++i) {
- ASSERT_THAT(sendto(t_, buf + i * psize, 0, 0, addr_[0], addrlen_),
- SyscallSucceedsWithValue(0));
-
- // Check that regardless of how many packets are in the queue, the size
- // reported is that of a single packet.
- n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, 0);
- }
-}
-
-TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) {
- int n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, 0);
-
- // Bind s_ to loopback:TestPort.
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- // A UDP socket must be connected before it can be shutdown.
- ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
-
- n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, 0);
-
- const char str[] = "abc";
- ASSERT_THAT(send(s_, str, 0, 0), SyscallSucceedsWithValue(0));
-
- n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, 0);
-
- EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
-
- n = -1;
- EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
- EXPECT_EQ(n, 0);
-}
-
-TEST_P(UdpSocketTest, ErrorQueue) {
- char cmsgbuf[CMSG_SPACE(sizeof(sock_extended_err))];
- msghdr msg;
- memset(&msg, 0, sizeof(msg));
- iovec iov;
- memset(&iov, 0, sizeof(iov));
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
- msg.msg_control = cmsgbuf;
- msg.msg_controllen = sizeof(cmsgbuf);
-
- // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT.
- EXPECT_THAT(RetryEINTR(recvmsg)(s_, &msg, MSG_ERRQUEUE),
- SyscallFailsWithErrno(EAGAIN));
-}
-
-TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
- int v = -1;
- socklen_t optlen = sizeof(v);
- ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, &optlen),
- SyscallSucceeds());
- ASSERT_EQ(v, kSockOptOff);
- ASSERT_EQ(optlen, sizeof(v));
-}
-
-TEST_P(UdpSocketTest, SoTimestamp) {
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
- int v = 1;
- ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
- SyscallSucceeds());
-
- char buf[3];
- // Send zero length packet from t_ to s_.
- ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
-
- char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
- msghdr msg;
- memset(&msg, 0, sizeof(msg));
- iovec iov;
- memset(&iov, 0, sizeof(iov));
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
- msg.msg_control = cmsgbuf;
- msg.msg_controllen = sizeof(cmsgbuf);
-
- ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- ASSERT_EQ(cmsg->cmsg_type, SO_TIMESTAMP);
- ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval)));
-
- struct timeval tv = {};
- memcpy(&tv, CMSG_DATA(cmsg), sizeof(struct timeval));
-
- ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
-
- // There should be nothing to get via ioctl.
- ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT));
-}
-
-TEST_P(UdpSocketTest, WriteShutdownNotConnected) {
- EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
-}
-
-TEST_P(UdpSocketTest, TimestampIoctl) {
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
- char buf[3];
- // Send packet from t_ to s_.
- ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
- SyscallSucceedsWithValue(sizeof(buf)));
-
- // There should be no control messages.
- char recv_buf[sizeof(buf)];
- ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
-
- // A nonzero timeval should be available via ioctl.
- struct timeval tv = {};
- ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds());
- ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
-}
-
-TEST_P(UdpSocketTest, TimetstampIoctlNothingRead) {
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
- struct timeval tv = {};
- ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT));
-}
-
-// Test that the timestamp accessed via SIOCGSTAMP is still accessible after
-// SO_TIMESTAMP is enabled and used to retrieve a timestamp.
-TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
- ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
- ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
-
- char buf[3];
- // Send packet from t_ to s_.
- ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
- SyscallSucceedsWithValue(sizeof(buf)));
- ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
-
- // There should be no control messages.
- char recv_buf[sizeof(buf)];
- ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
-
- // A nonzero timeval should be available via ioctl.
- struct timeval tv = {};
- ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds());
- ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
-
- // Enable SO_TIMESTAMP and send a message.
- int v = 1;
- EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
- SyscallSucceeds());
- ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
-
- // There should be a message for SO_TIMESTAMP.
- char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
- msghdr msg = {};
- iovec iov = {};
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
- msg.msg_control = cmsgbuf;
- msg.msg_controllen = sizeof(cmsgbuf);
- ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
-
- // The ioctl should return the exact same values as before.
- struct timeval tv2 = {};
- ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv2), SyscallSucceeds());
- ASSERT_EQ(tv.tv_sec, tv2.tv_sec);
- ASSERT_EQ(tv.tv_usec, tv2.tv_usec);
-}
-
INSTANTIATE_TEST_SUITE_P(AllInetTests, UdpSocketTest,
::testing::Values(AddressFamily::kIpv4,
AddressFamily::kIpv6,
diff --git a/test/syscalls/linux/udp_socket_errqueue_test_case.cc b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
new file mode 100644
index 000000000..147978f46
--- /dev/null
+++ b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
@@ -0,0 +1,54 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/udp_socket_test_cases.h"
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <linux/errqueue.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(UdpSocketTest, ErrorQueue) {
+ char cmsgbuf[CMSG_SPACE(sizeof(sock_extended_err))];
+ msghdr msg;
+ memset(&msg, 0, sizeof(msg));
+ iovec iov;
+ memset(&iov, 0, sizeof(iov));
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = sizeof(cmsgbuf);
+
+ // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT.
+ EXPECT_THAT(RetryEINTR(recvmsg)(s_, &msg, MSG_ERRQUEUE),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
new file mode 100644
index 000000000..68e0a8109
--- /dev/null
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -0,0 +1,1500 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/udp_socket_test_cases.h"
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Gets a pointer to the port component of the given address.
+uint16_t* Port(struct sockaddr_storage* addr) {
+ switch (addr->ss_family) {
+ case AF_INET: {
+ auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
+ return &sin->sin_port;
+ }
+ case AF_INET6: {
+ auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr);
+ return &sin6->sin6_port;
+ }
+ }
+
+ return nullptr;
+}
+
+void UdpSocketTest::SetUp() {
+ int type;
+ if (GetParam() == AddressFamily::kIpv4) {
+ type = AF_INET;
+ auto sin = reinterpret_cast<struct sockaddr_in*>(&anyaddr_storage_);
+ addrlen_ = sizeof(*sin);
+ sin->sin_addr.s_addr = htonl(INADDR_ANY);
+ } else {
+ type = AF_INET6;
+ auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&anyaddr_storage_);
+ addrlen_ = sizeof(*sin6);
+ if (GetParam() == AddressFamily::kIpv6) {
+ sin6->sin6_addr = IN6ADDR_ANY_INIT;
+ } else {
+ TestAddress const& v4_mapped_any = V4MappedAny();
+ sin6->sin6_addr =
+ reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
+ ->sin6_addr;
+ }
+ }
+ ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
+
+ ASSERT_THAT(t_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
+
+ memset(&anyaddr_storage_, 0, sizeof(anyaddr_storage_));
+ anyaddr_ = reinterpret_cast<struct sockaddr*>(&anyaddr_storage_);
+ anyaddr_->sa_family = type;
+
+ if (gvisor::testing::IsRunningOnGvisor()) {
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
+ ports_[i] = TestPort + i;
+ }
+ } else {
+ // When not under gvisor, use utility function to pick port. Assert that
+ // all ports are different.
+ std::string error;
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
+ // Find an unused port, we specify port 0 to allow the kernel to provide
+ // the port.
+ bool unique = true;
+ do {
+ ports_[i] = ASSERT_NO_ERRNO_AND_VALUE(PortAvailable(
+ 0, AddressFamily::kDualStack, SocketType::kUdp, false));
+ ASSERT_GT(ports_[i], 0);
+ for (size_t j = 0; j < i; ++j) {
+ if (ports_[j] == ports_[i]) {
+ unique = false;
+ break;
+ }
+ }
+ } while (!unique);
+ }
+ }
+
+ // Initialize the sockaddrs.
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(addr_); ++i) {
+ memset(&addr_storage_[i], 0, sizeof(addr_storage_[i]));
+
+ addr_[i] = reinterpret_cast<struct sockaddr*>(&addr_storage_[i]);
+ addr_[i]->sa_family = type;
+
+ switch (type) {
+ case AF_INET: {
+ auto sin = reinterpret_cast<struct sockaddr_in*>(addr_[i]);
+ sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ sin->sin_port = htons(ports_[i]);
+ break;
+ }
+ case AF_INET6: {
+ auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr_[i]);
+ sin6->sin6_addr = in6addr_loopback;
+ sin6->sin6_port = htons(ports_[i]);
+ break;
+ }
+ }
+ }
+}
+
+TEST_P(UdpSocketTest, Creation) {
+ int type = AF_INET6;
+ if (GetParam() == AddressFamily::kIpv4) {
+ type = AF_INET;
+ }
+
+ int s_;
+
+ ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, IPPROTO_UDP), SyscallSucceeds());
+ EXPECT_THAT(close(s_), SyscallSucceeds());
+
+ ASSERT_THAT(s_ = socket(type, SOCK_DGRAM, 0), SyscallSucceeds());
+ EXPECT_THAT(close(s_), SyscallSucceeds());
+
+ ASSERT_THAT(s_ = socket(type, SOCK_STREAM, IPPROTO_UDP), SyscallFails());
+}
+
+TEST_P(UdpSocketTest, Getsockname) {
+ // Check that we're not bound.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(memcmp(&addr, anyaddr_, addrlen_), 0);
+
+ // Bind, then check that we get the right address.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, Getpeername) {
+ // Check that we're not connected.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+
+ // Connect, then check that we get the right address.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ addrlen = sizeof(addr);
+ EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, SendNotConnected) {
+ // Do send & write, they must fail.
+ char buf[512];
+ EXPECT_THAT(send(s_, buf, sizeof(buf), 0),
+ SyscallFailsWithErrno(EDESTADDRREQ));
+
+ EXPECT_THAT(write(s_, buf, sizeof(buf)), SyscallFailsWithErrno(EDESTADDRREQ));
+
+ // Use sendto.
+ ASSERT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Check that we're bound now.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_NE(*Port(&addr), 0);
+}
+
+TEST_P(UdpSocketTest, ConnectBinds) {
+ // Connect the socket.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Check that we're bound now.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_NE(*Port(&addr), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveNotBound) {
+ char buf[512];
+ EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, Bind) {
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Try to bind again.
+ EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL));
+
+ // Check that we're still bound to the original address.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(memcmp(&addr, addr_[0], addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, BindInUse) {
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Try to bind again.
+ EXPECT_THAT(bind(t_, addr_[0], addrlen_), SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(UdpSocketTest, ReceiveAfterConnect) {
+ // Connect s_ to loopback:TestPort, and bind t_ to loopback:TestPort.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Get the address s_ was bound to during connect.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+
+ // Send from t_ to s_.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+ ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0,
+ reinterpret_cast<sockaddr*>(&addr), addrlen),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Receive the data.
+ char received[sizeof(buf)];
+ EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+ SyscallSucceedsWithValue(sizeof(received)));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveAfterDisconnect) {
+  // Connect s_ to loopback:TestPort, bind t_ to loopback:TestPort, and
+  // connect t_ to loopback:TestPort+1.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(bind(t_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+ // Get the address s_ was bound to during connect.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+
+ for (int i = 0; i < 2; i++) {
+ // Send from t_ to s_.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+ EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0,
+ reinterpret_cast<sockaddr*>(&addr), addrlen),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Receive the data.
+ char received[sizeof(buf)];
+ EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+ SyscallSucceedsWithValue(sizeof(received)));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+
+    // Disconnect s_ by connecting with family AF_UNSPEC, which dissolves the
+    // association (see connect(2)).
+ struct sockaddr addr = {};
+ addr.sa_family = AF_UNSPEC;
+ ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)), SyscallSucceeds());
+    // Reconnect s_ to loopback:TestPort.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+ }
+}
+
+TEST_P(UdpSocketTest, Connect) {
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Check that we're connected to the right peer.
+ struct sockaddr_storage peer;
+ socklen_t peerlen = sizeof(peer);
+ EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
+ SyscallSucceeds());
+ EXPECT_EQ(peerlen, addrlen_);
+ EXPECT_EQ(memcmp(&peer, addr_[0], addrlen_), 0);
+
+ // Try to bind after connect.
+ EXPECT_THAT(bind(s_, addr_[1], addrlen_), SyscallFailsWithErrno(EINVAL));
+
+ // Try to connect again.
+ EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
+
+ // Check that peer name changed.
+ peerlen = sizeof(peer);
+ EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
+ SyscallSucceeds());
+ EXPECT_EQ(peerlen, addrlen_);
+ EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
+}
+
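+// Helper for the ConnectAny tests below: connects sockfd to the wildcard (ANY)
+// address on the given port, checking that the socket's local address is ANY
+// (and it has no peer) beforehand, and that it resolves to loopback afterwards.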
+void ConnectAny(AddressFamily family, int sockfd, uint16_t port) {
+ struct sockaddr_storage addr = {};
+
+ // Precondition check.
+ {
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ if (family == AddressFamily::kIpv4) {
+ auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
+ EXPECT_EQ(addrlen, sizeof(*addr_out));
+ EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
+ } else {
+ auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
+ EXPECT_EQ(addrlen, sizeof(*addr_out));
+ struct in6_addr any = IN6ADDR_ANY_INIT;
+ EXPECT_EQ(memcmp(&addr_out->sin6_addr, &any, sizeof(in6_addr)), 0);
+ }
+
+ {
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+ }
+
+ struct sockaddr_storage baddr = {};
+ if (family == AddressFamily::kIpv4) {
+ auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
+ addrlen = sizeof(*addr_in);
+ addr_in->sin_family = AF_INET;
+ addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
+ addr_in->sin_port = port;
+ } else {
+ auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
+ addrlen = sizeof(*addr_in);
+ addr_in->sin6_family = AF_INET6;
+ addr_in->sin6_port = port;
+ if (family == AddressFamily::kIpv6) {
+ addr_in->sin6_addr = IN6ADDR_ANY_INIT;
+ } else {
+ TestAddress const& v4_mapped_any = V4MappedAny();
+ addr_in->sin6_addr =
+ reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
+ ->sin6_addr;
+ }
+ }
+
+ // TODO(b/138658473): gVisor doesn't allow connecting to the zero port.
+ if (port == 0) {
+ SKIP_IF(IsRunningOnGvisor());
+ }
+
+ ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen),
+ SyscallSucceeds());
+ }
+
+ // Postcondition check.
+ {
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ if (family == AddressFamily::kIpv4) {
+ auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
+ EXPECT_EQ(addrlen, sizeof(*addr_out));
+ EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK));
+ } else {
+ auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
+ EXPECT_EQ(addrlen, sizeof(*addr_out));
+ struct in6_addr loopback;
+ if (family == AddressFamily::kIpv6) {
+ loopback = IN6ADDR_LOOPBACK_INIT;
+ } else {
+ TestAddress const& v4_mapped_loopback = V4MappedLoopback();
+ loopback = reinterpret_cast<const struct sockaddr_in6*>(
+ &v4_mapped_loopback.addr)
+ ->sin6_addr;
+ }
+
+ EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
+ }
+
+ addrlen = sizeof(addr);
+ if (port == 0) {
+ EXPECT_THAT(
+ getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+ } else {
+ EXPECT_THAT(
+ getpeername(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ }
+ }
+}
+
+TEST_P(UdpSocketTest, ConnectAny) { ConnectAny(GetParam(), s_, 0); }
+
+TEST_P(UdpSocketTest, ConnectAnyWithPort) {
+ auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
+ ConnectAny(GetParam(), s_, port);
+}
+
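+// Helper for the DisconnectAfterConnectAny tests below: connects sockfd to the
+// wildcard (ANY) address, disconnects it with AF_UNSPEC, and checks that the
+// socket's local address reverts to the ANY address.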
+void DisconnectAfterConnectAny(AddressFamily family, int sockfd, int port) {
+ struct sockaddr_storage addr = {};
+
+ socklen_t addrlen = sizeof(addr);
+ struct sockaddr_storage baddr = {};
+ if (family == AddressFamily::kIpv4) {
+ auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
+ addrlen = sizeof(*addr_in);
+ addr_in->sin_family = AF_INET;
+ addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
+ addr_in->sin_port = port;
+ } else {
+ auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
+ addrlen = sizeof(*addr_in);
+ addr_in->sin6_family = AF_INET6;
+ addr_in->sin6_port = port;
+ if (family == AddressFamily::kIpv6) {
+ addr_in->sin6_addr = IN6ADDR_ANY_INIT;
+ } else {
+ TestAddress const& v4_mapped_any = V4MappedAny();
+ addr_in->sin6_addr =
+ reinterpret_cast<const struct sockaddr_in6*>(&v4_mapped_any.addr)
+ ->sin6_addr;
+ }
+ }
+
+ // TODO(b/138658473): gVisor doesn't allow connecting to the zero port.
+ if (port == 0) {
+ SKIP_IF(IsRunningOnGvisor());
+ }
+
+ ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&baddr), addrlen),
+ SyscallSucceeds());
+ // Now the socket is bound to the loopback address.
+
+  // Disconnect the socket.
+ addrlen = sizeof(addr);
+ addr.ss_family = AF_UNSPEC;
+ ASSERT_THAT(connect(sockfd, reinterpret_cast<sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+
+ // Check that after disconnect the socket is bound to the ANY address.
+ EXPECT_THAT(getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ if (family == AddressFamily::kIpv4) {
+ auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
+ EXPECT_EQ(addrlen, sizeof(*addr_out));
+ EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
+ } else {
+ auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
+ EXPECT_EQ(addrlen, sizeof(*addr_out));
+    struct in6_addr any = IN6ADDR_ANY_INIT;
+
+    EXPECT_EQ(memcmp(&addr_out->sin6_addr, &any, sizeof(in6_addr)), 0);
+ }
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterConnectAny) {
+ DisconnectAfterConnectAny(GetParam(), s_, 0);
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterConnectAnyWithPort) {
+ auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
+ DisconnectAfterConnectAny(GetParam(), s_, port);
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterBind) {
+ ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
+ // Connect the socket.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ struct sockaddr_storage addr = {};
+ addr.ss_family = AF_UNSPEC;
+ EXPECT_THAT(
+ connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
+ SyscallSucceeds());
+
+ // Check that we're still bound.
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0);
+
+ addrlen = sizeof(addr);
+ EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, BindToAnyConnectToLocalhost) {
+ struct sockaddr_storage baddr = {};
+ auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
+ if (GetParam() == AddressFamily::kIpv4) {
+ auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
+ addr_in->sin_family = AF_INET;
+ addr_in->sin_port = port;
+ addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
+ } else {
+ auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
+ addr_in->sin6_family = AF_INET6;
+ addr_in->sin6_port = port;
+ addr_in->sin6_scope_id = 0;
+ addr_in->sin6_addr = IN6ADDR_ANY_INIT;
+ }
+ ASSERT_THAT(bind(s_, reinterpret_cast<sockaddr*>(&baddr), addrlen_),
+ SyscallSucceeds());
+ // Connect the socket.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ struct sockaddr_storage addr = {};
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ // If the socket is bound to ANY and connected to a loopback address,
+ // getsockname() has to return the loopback address.
+ if (GetParam() == AddressFamily::kIpv4) {
+ auto addr_out = reinterpret_cast<struct sockaddr_in*>(&addr);
+ EXPECT_EQ(addrlen, sizeof(*addr_out));
+ EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK));
+ } else {
+ auto addr_out = reinterpret_cast<struct sockaddr_in6*>(&addr);
+ struct in6_addr loopback = IN6ADDR_LOOPBACK_INIT;
+ EXPECT_EQ(addrlen, sizeof(*addr_out));
+ EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
+ }
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterBindToAny) {
+ struct sockaddr_storage baddr = {};
+ socklen_t addrlen;
+ auto port = *Port(reinterpret_cast<struct sockaddr_storage*>(addr_[1]));
+ if (GetParam() == AddressFamily::kIpv4) {
+ auto addr_in = reinterpret_cast<struct sockaddr_in*>(&baddr);
+ addr_in->sin_family = AF_INET;
+ addr_in->sin_port = port;
+ addr_in->sin_addr.s_addr = htonl(INADDR_ANY);
+ } else {
+ auto addr_in = reinterpret_cast<struct sockaddr_in6*>(&baddr);
+ addr_in->sin6_family = AF_INET6;
+ addr_in->sin6_port = port;
+ addr_in->sin6_scope_id = 0;
+ addr_in->sin6_addr = IN6ADDR_ANY_INIT;
+ }
+ ASSERT_THAT(bind(s_, reinterpret_cast<sockaddr*>(&baddr), addrlen_),
+ SyscallSucceeds());
+ // Connect the socket.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ struct sockaddr_storage addr = {};
+ addr.ss_family = AF_UNSPEC;
+ EXPECT_THAT(
+ connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
+ SyscallSucceeds());
+
+ // Check that we're still bound.
+ addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(memcmp(&addr, &baddr, addrlen), 0);
+
+ addrlen = sizeof(addr);
+ EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, Disconnect) {
+ for (int i = 0; i < 2; i++) {
+ // Try to connect again.
+ EXPECT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
+
+ // Check that we're connected to the right peer.
+ struct sockaddr_storage peer;
+ socklen_t peerlen = sizeof(peer);
+ EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
+ SyscallSucceeds());
+ EXPECT_EQ(peerlen, addrlen_);
+ EXPECT_EQ(memcmp(&peer, addr_[2], addrlen_), 0);
+
+ // Try to disconnect.
+ struct sockaddr_storage addr = {};
+ addr.ss_family = AF_UNSPEC;
+ EXPECT_THAT(
+ connect(s_, reinterpret_cast<sockaddr*>(&addr), sizeof(addr.ss_family)),
+ SyscallSucceeds());
+
+ peerlen = sizeof(peer);
+ EXPECT_THAT(getpeername(s_, reinterpret_cast<sockaddr*>(&peer), &peerlen),
+ SyscallFailsWithErrno(ENOTCONN));
+
+ // Check that we're still bound.
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(s_, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(*Port(&addr), 0);
+ }
+}
+
+TEST_P(UdpSocketTest, ConnectBadAddress) {
+ struct sockaddr addr = {};
+ addr.sa_family = addr_[0]->sa_family;
+ ASSERT_THAT(connect(s_, &addr, sizeof(addr.sa_family)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) {
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Send to a different destination than we're connected to.
+ char buf[512];
+ EXPECT_THAT(sendto(s_, buf, sizeof(buf), 0, addr_[1], addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+}
+
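+// A zero-length datagram is valid in UDP: writing zero bytes sends a packet,
+// and the corresponding read() on the receiver returns 0 without indicating
+// end-of-file.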
+TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
+ // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
+ SKIP_IF(IsRunningWithHostinet());
+
+ // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+ // Bind t_ to loopback:TestPort+1.
+ ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+ char buf[3];
+ // Send zero length packet from s_ to t_.
+ ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
+ // Receive the packet.
+ char received[3];
+ EXPECT_THAT(read(t_, received, sizeof(received)),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
+ // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
+ SKIP_IF(IsRunningWithHostinet());
+
+ // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+ // Bind t_ to loopback:TestPort+1.
+ ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+ // Set t_ to non-blocking.
+ int opts = 0;
+ ASSERT_THAT(opts = fcntl(t_, F_GETFL), SyscallSucceeds());
+ ASSERT_THAT(fcntl(t_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds());
+
+ char buf[3];
+ // Send zero length packet from s_ to t_.
+ ASSERT_THAT(write(s_, buf, 0), SyscallSucceedsWithValue(0));
+ // Receive the packet.
+ char received[3];
+ EXPECT_THAT(read(t_, received, sizeof(received)),
+ SyscallSucceedsWithValue(0));
+ EXPECT_THAT(read(t_, received, sizeof(received)),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(UdpSocketTest, SendAndReceiveNotConnected) {
+ // Bind s_ to loopback.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Send some data to s_.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Receive the data.
+ char received[sizeof(buf)];
+ EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+ SyscallSucceedsWithValue(sizeof(received)));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, SendAndReceiveConnected) {
+ // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+ // Bind t_ to loopback:TestPort+1.
+ ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+ // Send some data from t_ to s_.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Receive the data.
+ char received[sizeof(buf)];
+ EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+ SyscallSucceedsWithValue(sizeof(received)));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveFromNotConnected) {
+ // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+ // Bind t_ to loopback:TestPort+2.
+ ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
+
+ // Send some data from t_ to s_.
+ char buf[512];
+ ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+  // Check that the data isn't received because it was sent from a different
+  // address than the one we're connected to.
+ EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReceiveBeforeConnect) {
+ // Bind s_ to loopback:TestPort.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Bind t_ to loopback:TestPort+2.
+ ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
+
+ // Send some data from t_ to s_.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Connect to loopback:TestPort+1.
+ ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+ // Receive the data. It works because it was sent before the connect.
+ char received[sizeof(buf)];
+ EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+ SyscallSucceedsWithValue(sizeof(received)));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+
+ // Send again. This time it should not be received.
+ ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(recv(s_, buf, sizeof(buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReceiveFrom) {
+ // Bind s_ to loopback:TestPort, and connect to loopback:TestPort+1.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(s_, addr_[1], addrlen_), SyscallSucceeds());
+
+ // Bind t_ to loopback:TestPort+1.
+ ASSERT_THAT(bind(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+ // Send some data from t_ to s_.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ ASSERT_THAT(sendto(t_, buf, sizeof(buf), 0, addr_[0], addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Receive the data and sender address.
+ char received[sizeof(buf)];
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(recvfrom(s_, received, sizeof(received), 0,
+ reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceedsWithValue(sizeof(received)));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(memcmp(&addr, addr_[1], addrlen_), 0);
+}
+
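+// listen() and accept() apply only to connection-oriented sockets, so they are
+// expected to fail with EOPNOTSUPP on UDP sockets.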
+TEST_P(UdpSocketTest, Listen) {
+ ASSERT_THAT(listen(s_, SOMAXCONN), SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+TEST_P(UdpSocketTest, Accept) {
+ ASSERT_THAT(accept(s_, nullptr, nullptr), SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+// This test validates that a read shutdown with pending data allows the read
+// to proceed with the data before returning EAGAIN.
+TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) {
+ char received[512];
+
+ // Bind t_ to loopback:TestPort+2.
+ ASSERT_THAT(bind(t_, addr_[2], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(t_, addr_[1], addrlen_), SyscallSucceeds());
+
+  // Bind s_ to loopback:TestPort+1 and connect it to loopback:TestPort+2, so
+  // that s_ and t_ are paired.
+ ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
+
+ // Verify that we get EWOULDBLOCK when there is nothing to read.
+ EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ const char* buf = "abc";
+ EXPECT_THAT(write(t_, buf, 3), SyscallSucceedsWithValue(3));
+
+ int opts = 0;
+ ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
+ ASSERT_THAT(fcntl(s_, F_SETFL, opts | O_NONBLOCK), SyscallSucceeds());
+ ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
+ ASSERT_NE(opts & O_NONBLOCK, 0);
+
+ EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+ // We should get the data even though read has been shutdown.
+ EXPECT_THAT(recv(s_, received, 2, 0), SyscallSucceedsWithValue(2));
+
+  // Because we read less than the entire packet and this is a packet-based
+  // socket, the rest of the packet is discarded and any subsequent reads
+  // should return EWOULDBLOCK.
+ EXPECT_THAT(recv(s_, received, 1, 0), SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+// This test validates that shutting down a socket and then reconnecting it
+// resets the shutdown state.
+TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) {
+ char received[512];
+ EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
+
+ EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+  // Bind and connect the socket, then check that the earlier failed shutdown
+  // did not leave it in a shut-down state.
+ ASSERT_THAT(bind(s_, addr_[1], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(s_, addr_[2], addrlen_), SyscallSucceeds());
+
+ EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReadShutdown) {
+ // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
+ // MSG_DONTWAIT blocks indefinitely.
+ SKIP_IF(IsRunningWithHostinet());
+
+ char received[512];
+ EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
+
+ EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Connect the socket, then try to shutdown again.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
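+  // With no pending data, a read on a socket shut down for reading returns 0
+  // instead of blocking.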
+ EXPECT_THAT(recv(s_, received, sizeof(received), 0),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, ReadShutdownDifferentThread) {
+ // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
+ // MSG_DONTWAIT blocks indefinitely.
+ SKIP_IF(IsRunningWithHostinet());
+
+ char received[512];
+ EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Connect the socket, then shutdown from another thread.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ EXPECT_THAT(recv(s_, received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ ScopedThread t([&] {
+ absl::SleepFor(absl::Milliseconds(200));
+ EXPECT_THAT(shutdown(this->s_, SHUT_RD), SyscallSucceeds());
+ });
+ EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
+ SyscallSucceedsWithValue(0));
+ t.Join();
+
+ EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, WriteShutdown) {
+ EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+ EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceeds());
+}
+
+TEST_P(UdpSocketTest, SynchronousReceive) {
+ // Bind s_ to loopback.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Send some data to s_ from another thread.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+
+  // Verify that nothing is available to receive before the other thread sends.
+ char received[512];
+ EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Start the thread.
+ ScopedThread t([&] {
+ absl::SleepFor(absl::Milliseconds(200));
+ ASSERT_THAT(
+ sendto(this->t_, buf, sizeof(buf), 0, this->addr_[0], this->addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ });
+
+ EXPECT_THAT(RetryEINTR(recv)(s_, received, sizeof(received), 0),
+ SyscallSucceedsWithValue(512));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
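+// UDP preserves message boundaries: each receive operation returns at most one
+// datagram, even if the supplied buffers could hold more. The next three tests
+// verify this for send/recv, writev/readv, and sendmsg/recvmsg.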
+TEST_P(UdpSocketTest, BoundaryPreserved_SendRecv) {
+ // Bind s_ to loopback:TestPort.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Send 3 packets from t_ to s_.
+ constexpr int psize = 100;
+ char buf[3 * psize];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_),
+ SyscallSucceedsWithValue(psize));
+ }
+
+ // Receive the data as 3 separate packets.
+ char received[6 * psize];
+ for (int i = 0; i < 3; ++i) {
+ EXPECT_THAT(recv(s_, received + i * psize, 3 * psize, 0),
+ SyscallSucceedsWithValue(psize));
+ }
+ EXPECT_EQ(memcmp(buf, received, 3 * psize), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_WritevReadv) {
+ // Bind s_ to loopback:TestPort.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Direct writes from t_ to s_.
+ ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Send 2 packets from t_ to s_, where each packet's data consists of 2
+ // discontiguous iovecs.
+ constexpr size_t kPieceSize = 100;
+ char buf[4 * kPieceSize];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ for (int i = 0; i < 2; i++) {
+ struct iovec iov[2];
+ for (int j = 0; j < 2; j++) {
+ iov[j].iov_base = reinterpret_cast<void*>(
+ reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
+ iov[j].iov_len = kPieceSize;
+ }
+ ASSERT_THAT(writev(t_, iov, 2), SyscallSucceedsWithValue(2 * kPieceSize));
+ }
+
+ // Receive the data as 2 separate packets.
+ char received[6 * kPieceSize];
+ for (int i = 0; i < 2; i++) {
+ struct iovec iov[3];
+ for (int j = 0; j < 3; j++) {
+ iov[j].iov_base = reinterpret_cast<void*>(
+ reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
+ iov[j].iov_len = kPieceSize;
+ }
+ ASSERT_THAT(readv(s_, iov, 3), SyscallSucceedsWithValue(2 * kPieceSize));
+ }
+ EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_SendMsgRecvMsg) {
+ // Bind s_ to loopback:TestPort.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Send 2 packets from t_ to s_, where each packet's data consists of 2
+ // discontiguous iovecs.
+ constexpr size_t kPieceSize = 100;
+ char buf[4 * kPieceSize];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ for (int i = 0; i < 2; i++) {
+ struct iovec iov[2];
+ for (int j = 0; j < 2; j++) {
+ iov[j].iov_base = reinterpret_cast<void*>(
+ reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
+ iov[j].iov_len = kPieceSize;
+ }
+ struct msghdr msg = {};
+ msg.msg_name = addr_[0];
+ msg.msg_namelen = addrlen_;
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 2;
+ ASSERT_THAT(sendmsg(t_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize));
+ }
+
+ // Receive the data as 2 separate packets.
+ char received[6 * kPieceSize];
+ for (int i = 0; i < 2; i++) {
+ struct iovec iov[3];
+ for (int j = 0; j < 3; j++) {
+ iov[j].iov_base = reinterpret_cast<void*>(
+ reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
+ iov[j].iov_len = kPieceSize;
+ }
+ struct msghdr msg = {};
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 3;
+ ASSERT_THAT(recvmsg(s_, &msg, 0), SyscallSucceedsWithValue(2 * kPieceSize));
+ }
+ EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
+}
+
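+// On UDP sockets, FIONREAD reports the number of bytes in the next pending
+// datagram (0 when the receive queue is empty), not the total number of bytes
+// queued.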
+TEST_P(UdpSocketTest, FIONREADShutdown) {
+ int n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+  // A UDP socket must be connected before it can be shut down.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+ n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+}
+
+TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
+ int n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ // Bind s_ to loopback:TestPort.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // A UDP socket must be connected before it can be shut down.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ const char str[] = "abc";
+ ASSERT_THAT(send(s_, str, sizeof(str), 0),
+ SyscallSucceedsWithValue(sizeof(str)));
+
+ n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, sizeof(str));
+
+ EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+ n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, sizeof(str));
+}
+
+TEST_P(UdpSocketTest, Fionread) {
+ // Bind s_ to loopback:TestPort.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Check that FIONREAD on the bound socket reports 0 while no packet is
+  // queued.
+ int n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ // Send 3 packets from t_ to s_.
+ constexpr int psize = 100;
+ char buf[3 * psize];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_THAT(sendto(t_, buf + i * psize, psize, 0, addr_[0], addrlen_),
+ SyscallSucceedsWithValue(psize));
+
+ // Check that regardless of how many packets are in the queue, the size
+ // reported is that of a single packet.
+ n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, psize);
+ }
+}
+
+TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) {
+ // Bind s_ to loopback:TestPort.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // Check that FIONREAD on the bound socket reports 0 while no packet is
+  // queued.
+ int n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ // Send 3 packets from t_ to s_.
+ constexpr int psize = 100;
+ char buf[3 * psize];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_THAT(sendto(t_, buf + i * psize, 0, 0, addr_[0], addrlen_),
+ SyscallSucceedsWithValue(0));
+
+ // Check that regardless of how many packets are in the queue, the size
+ // reported is that of a single packet.
+ n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+ }
+}
+
+TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) {
+ int n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ // Bind s_ to loopback:TestPort.
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+  // A UDP socket must be connected before it can be shut down.
+ ASSERT_THAT(connect(s_, addr_[0], addrlen_), SyscallSucceeds());
+
+ n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ const char str[] = "abc";
+ ASSERT_THAT(send(s_, str, 0, 0), SyscallSucceedsWithValue(0));
+
+ n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ EXPECT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+ n = -1;
+ EXPECT_THAT(ioctl(s_, FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+}
+
+TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
+ // TODO(gvisor.dev/issue/1202): SO_TIMESTAMP socket option not supported by
+ // hostinet.
+ SKIP_IF(IsRunningWithHostinet());
+
+ int v = -1;
+ socklen_t optlen = sizeof(v);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, &optlen),
+ SyscallSucceeds());
+ ASSERT_EQ(v, kSockOptOff);
+ ASSERT_EQ(optlen, sizeof(v));
+}
+
+TEST_P(UdpSocketTest, SoTimestamp) {
+ // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
+ // supported by hostinet.
+ SKIP_IF(IsRunningWithHostinet());
+
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+ int v = 1;
+ ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
+ SyscallSucceeds());
+
+ char buf[3];
+ // Send zero length packet from t_ to s_.
+ ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
+
+ char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
+ msghdr msg;
+ memset(&msg, 0, sizeof(msg));
+ iovec iov;
+ memset(&iov, 0, sizeof(iov));
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = sizeof(cmsgbuf);
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SO_TIMESTAMP);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval)));
+
+ struct timeval tv = {};
+ memcpy(&tv, CMSG_DATA(cmsg), sizeof(struct timeval));
+
+ ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+
+ // There should be nothing to get via ioctl.
+ ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_P(UdpSocketTest, WriteShutdownNotConnected) {
+ EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+}
+
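+// SIOCGSTAMP returns the receive timestamp of the last packet delivered to the
+// user, and fails with ENOENT when no such timestamp is available.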
+TEST_P(UdpSocketTest, TimestampIoctl) {
+ // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+ SKIP_IF(IsRunningWithHostinet());
+
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+ char buf[3];
+ // Send packet from t_ to s_.
+ ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // There should be no control messages.
+ char recv_buf[sizeof(buf)];
+ ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
+
+ // A nonzero timeval should be available via ioctl.
+ struct timeval tv = {};
+ ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds());
+ ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+}
+
+TEST_P(UdpSocketTest, TimestampIoctlNothingRead) {
+ // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+ SKIP_IF(IsRunningWithHostinet());
+
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+ struct timeval tv = {};
+ ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallFailsWithErrno(ENOENT));
+}
+
+// Test that the timestamp accessed via SIOCGSTAMP is still accessible after
+// SO_TIMESTAMP is enabled and used to retrieve a timestamp.
+TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
+ // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
+ // supported by hostinet.
+ SKIP_IF(IsRunningWithHostinet());
+
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+ char buf[3];
+ // Send packet from t_ to s_.
+ ASSERT_THAT(RetryEINTR(write)(t_, buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
+
+ // There should be no control messages.
+ char recv_buf[sizeof(buf)];
+ ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(s_, recv_buf, sizeof(recv_buf)));
+
+ // A nonzero timeval should be available via ioctl.
+ struct timeval tv = {};
+ ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv), SyscallSucceeds());
+ ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+
+ // Enable SO_TIMESTAMP and send a message.
+ int v = 1;
+ EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
+ SyscallSucceeds());
+ ASSERT_THAT(RetryEINTR(write)(t_, buf, 0), SyscallSucceedsWithValue(0));
+
+ // There should be a message for SO_TIMESTAMP.
+ char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
+ msghdr msg = {};
+ iovec iov = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = sizeof(cmsgbuf);
+ ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, 0), SyscallSucceedsWithValue(0));
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+
+ // The ioctl should return the exact same values as before.
+ struct timeval tv2 = {};
+ ASSERT_THAT(ioctl(s_, SIOCGSTAMP, &tv2), SyscallSucceeds());
+ ASSERT_EQ(tv.tv_sec, tv2.tv_sec);
+ ASSERT_EQ(tv.tv_usec, tv2.tv_usec);
+}
+
+// Test that a socket with IP_TOS or IPV6_TCLASS set will set the TOS byte on
+// outgoing packets, and that a receiving socket with IP_RECVTOS or
+// IPV6_RECVTCLASS will create the corresponding control message.
+TEST_P(UdpSocketTest, SetAndReceiveTOS) {
+ // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+ SKIP_IF((GetParam() != AddressFamily::kIpv4) && IsRunningOnGvisor() &&
+ !IsRunningWithHostinet());
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Allow socket to receive control message.
+ int recv_level = SOL_IP;
+ int recv_type = IP_RECVTOS;
+ if (GetParam() != AddressFamily::kIpv4) {
+ recv_level = SOL_IPV6;
+ recv_type = IPV6_RECVTCLASS;
+ }
+ ASSERT_THAT(
+ setsockopt(s_, recv_level, recv_type, &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Set socket TOS.
+ int sent_level = recv_level;
+ int sent_type = IP_TOS;
+ if (sent_level == SOL_IPV6) {
+ sent_type = IPV6_TCLASS;
+ }
+ int sent_tos = IPTOS_LOWDELAY; // Choose some TOS value.
+ ASSERT_THAT(
+ setsockopt(t_, sent_level, sent_type, &sent_tos, sizeof(sent_tos)),
+ SyscallSucceeds());
+
+ // Prepare message to send.
+ constexpr size_t kDataLength = 1024;
+ struct msghdr sent_msg = {};
+ struct iovec sent_iov = {};
+ char sent_data[kDataLength];
+ sent_iov.iov_base = &sent_data[0];
+ sent_iov.iov_len = kDataLength;
+ sent_msg.msg_iov = &sent_iov;
+ sent_msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(t_, &sent_msg, 0),
+ SyscallSucceedsWithValue(kDataLength));
+
+ // Receive message.
+ struct msghdr received_msg = {};
+ struct iovec received_iov = {};
+ char received_data[kDataLength];
+ received_iov.iov_base = &received_data[0];
+ received_iov.iov_len = kDataLength;
+ received_msg.msg_iov = &received_iov;
+ received_msg.msg_iovlen = 1;
+ size_t cmsg_data_len = sizeof(int8_t);
+ if (sent_type == IPV6_TCLASS) {
+ cmsg_data_len = sizeof(int);
+ }
+ std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+ received_msg.msg_control = &received_cmsgbuf[0];
+ received_msg.msg_controllen = received_cmsgbuf.size();
+ ASSERT_THAT(RetryEINTR(recvmsg)(s_, &received_msg, 0),
+ SyscallSucceedsWithValue(kDataLength));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+ EXPECT_EQ(cmsg->cmsg_level, sent_level);
+ EXPECT_EQ(cmsg->cmsg_type, sent_type);
+ int8_t received_tos = 0;
+ memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
+ EXPECT_EQ(received_tos, sent_tos);
+}
+
+// Test that sendmsg with IP_TOS and IPV6_TCLASS control messages will set the
+// TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
+// IPV6_RECVTCLASS will create the corresponding control message.
+TEST_P(UdpSocketTest, SendAndReceiveTOS) {
+ // TODO(b/68320120): IPV6_RECVTCLASS not supported for netstack.
+ // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
+ SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+ ASSERT_THAT(bind(s_, addr_[0], addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(t_, addr_[0], addrlen_), SyscallSucceeds());
+
+ // Allow socket to receive control message.
+ int recv_level = SOL_IP;
+ int recv_type = IP_RECVTOS;
+ if (GetParam() != AddressFamily::kIpv4) {
+ recv_level = SOL_IPV6;
+ recv_type = IPV6_RECVTCLASS;
+ }
+ int recv_opt = kSockOptOn;
+ ASSERT_THAT(
+ setsockopt(s_, recv_level, recv_type, &recv_opt, sizeof(recv_opt)),
+ SyscallSucceeds());
+
+ // Prepare message to send.
+ constexpr size_t kDataLength = 1024;
+ int sent_level = recv_level;
+ int sent_type = IP_TOS;
+ int sent_tos = IPTOS_LOWDELAY; // Choose some TOS value.
+
+ struct msghdr sent_msg = {};
+ struct iovec sent_iov = {};
+ char sent_data[kDataLength];
+ sent_iov.iov_base = &sent_data[0];
+ sent_iov.iov_len = kDataLength;
+ sent_msg.msg_iov = &sent_iov;
+ sent_msg.msg_iovlen = 1;
+ size_t cmsg_data_len = sizeof(int8_t);
+ if (sent_level == SOL_IPV6) {
+ sent_type = IPV6_TCLASS;
+ cmsg_data_len = sizeof(int);
+ }
+ std::vector<char> sent_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+ sent_msg.msg_control = &sent_cmsgbuf[0];
+ sent_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+
+ // Manually add control message.
+ struct cmsghdr* sent_cmsg = CMSG_FIRSTHDR(&sent_msg);
+ sent_cmsg->cmsg_len = CMSG_LEN(cmsg_data_len);
+ sent_cmsg->cmsg_level = sent_level;
+ sent_cmsg->cmsg_type = sent_type;
+ *(int8_t*)CMSG_DATA(sent_cmsg) = sent_tos;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(t_, &sent_msg, 0),
+ SyscallSucceedsWithValue(kDataLength));
+
+ // Receive message.
+ struct msghdr received_msg = {};
+ struct iovec received_iov = {};
+ char received_data[kDataLength];
+ received_iov.iov_base = &received_data[0];
+ received_iov.iov_len = kDataLength;
+ received_msg.msg_iov = &received_iov;
+ received_msg.msg_iovlen = 1;
+ std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+ received_msg.msg_control = &received_cmsgbuf[0];
+ received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+ ASSERT_THAT(RetryEINTR(recvmsg)(s_, &received_msg, 0),
+ SyscallSucceedsWithValue(kDataLength));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+ EXPECT_EQ(cmsg->cmsg_level, sent_level);
+ EXPECT_EQ(cmsg->cmsg_type, sent_type);
+ int8_t received_tos = 0;
+ memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
+ EXPECT_EQ(received_tos, sent_tos);
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.h b/test/syscalls/linux/udp_socket_test_cases.h
new file mode 100644
index 000000000..2fd79d99e
--- /dev/null
+++ b/test/syscalls/linux/udp_socket_test_cases.h
@@ -0,0 +1,74 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_UDP_SOCKET_TEST_CASES_H_
+#define THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_UDP_SOCKET_TEST_CASES_H_
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// The initial port to be used on gvisor.
+constexpr int TestPort = 40000;
+
+// Fixture for tests parameterized by the address family to use (AF_INET and
+// AF_INET6) when creating sockets.
+class UdpSocketTest
+ : public ::testing::TestWithParam<gvisor::testing::AddressFamily> {
+ protected:
+ // Creates two sockets that will be used by test cases.
+ void SetUp() override;
+
+ // Closes the sockets created by SetUp().
+ void TearDown() override {
+ EXPECT_THAT(close(s_), SyscallSucceeds());
+ EXPECT_THAT(close(t_), SyscallSucceeds());
+
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(ports_); ++i) {
+ ASSERT_NO_ERRNO(FreeAvailablePort(ports_[i]));
+ }
+ }
+
+ // First UDP socket.
+ int s_;
+
+ // Second UDP socket.
+ int t_;
+
+ // The length of the socket address.
+ socklen_t addrlen_;
+
+  // Addresses pointing to loopback, using port TestPort + i for entry i.
+ struct sockaddr* addr_[3];
+
+  // The wildcard ("any") address.
+ struct sockaddr* anyaddr_;
+
+ // Used ports.
+ int ports_[3];
+
+ private:
+ // Storage for the loopback addresses.
+ struct sockaddr_storage addr_storage_[3];
+
+ // Storage for the "any" address.
+ struct sockaddr_storage anyaddr_storage_;
+};
+
+} // namespace testing
+} // namespace gvisor
+
+#endif  // THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_UDP_SOCKET_TEST_CASES_H_
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
new file mode 100644
index 000000000..75740238c
--- /dev/null
+++ b/test/syscalls/linux/xattr.cc
@@ -0,0 +1,488 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/util/capability_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class XattrTest : public FileTest {};
+
+TEST_F(XattrTest, XattrNullName) {
+ const char* path = test_file_name_.c_str();
+
+ EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(EFAULT));
+ EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, XattrEmptyName) {
+ const char* path = test_file_name_.c_str();
+
+ EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(ERANGE));
+ EXPECT_THAT(getxattr(path, "", nullptr, 0), SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, XattrLargeName) {
+ const char* path = test_file_name_.c_str();
+ std::string name = "user.";
+ name += std::string(XATTR_NAME_MAX - name.length(), 'a');
+
+ // TODO(b/127675828): Support setxattr and getxattr.
+ if (!IsRunningOnGvisor()) {
+ EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+ SyscallSucceeds());
+ EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+ SyscallSucceedsWithValue(0));
+ }
+
+ name += "a";
+ EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(ERANGE));
+ EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+ SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, XattrInvalidPrefix) {
+ const char* path = test_file_name_.c_str();
+ std::string name(XATTR_NAME_MAX, 'a');
+ EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+ EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+TEST_F(XattrTest, XattrReadOnly) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ char val = 'a';
+ size_t size = sizeof(val);
+
+ // TODO(b/127675828): Support setxattr and getxattr.
+ if (!IsRunningOnGvisor()) {
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+ SyscallSucceeds());
+ }
+
+ ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IRUSR));
+
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+ SyscallFailsWithErrno(EACCES));
+
+ // TODO(b/127675828): Support setxattr and getxattr.
+ if (!IsRunningOnGvisor()) {
+ char buf = '-';
+ EXPECT_THAT(getxattr(path, name, &buf, size),
+ SyscallSucceedsWithValue(size));
+ EXPECT_EQ(buf, val);
+ }
+}
+
+TEST_F(XattrTest, XattrWriteOnly) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IWUSR));
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ char val = 'a';
+ size_t size = sizeof(val);
+
+ // TODO(b/127675828): Support setxattr and getxattr.
+ if (!IsRunningOnGvisor()) {
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+ SyscallSucceeds());
+ }
+
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
+}
+
+TEST_F(XattrTest, XattrTrustedWithNonadmin) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+ SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ const char* path = test_file_name_.c_str();
+ const char name[] = "trusted.abc";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, XattrOnDirectory) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ char name[] = "user.abc";
+ EXPECT_THAT(setxattr(dir.path().c_str(), name, NULL, 0, /*flags=*/0),
+ SyscallSucceeds());
+ EXPECT_THAT(getxattr(dir.path().c_str(), name, NULL, 0),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, XattrOnSymlink) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
+ char name[] = "user.abc";
+ EXPECT_THAT(setxattr(link.path().c_str(), name, NULL, 0, /*flags=*/0),
+ SyscallSucceeds());
+ EXPECT_THAT(getxattr(link.path().c_str(), name, NULL, 0),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, XattrOnInvalidFileTypes) {
+ char name[] = "user.abc";
+
+ char char_device[] = "/dev/zero";
+ EXPECT_THAT(setxattr(char_device, name, NULL, 0, /*flags=*/0),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(getxattr(char_device, name, NULL, 0),
+ SyscallFailsWithErrno(ENODATA));
+
+ // Use tmpfs, where creation of named pipes is supported.
+ const std::string fifo = NewTempAbsPathInDir("/dev/shm");
+ const char* path = fifo.c_str();
+ EXPECT_THAT(mknod(path, S_IFIFO | S_IRUSR | S_IWUSR, 0), SyscallSucceeds());
+ EXPECT_THAT(setxattr(path, name, NULL, 0, /*flags=*/0),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(getxattr(path, name, NULL, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ std::vector<char> val = {'a', 'a'};
+ size_t size = 1;
+ EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+ SyscallSucceeds());
+
+ std::vector<char> buf = {'-', '-'};
+ std::vector<char> expected_buf = {'a', '-'};
+ EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+ SyscallSucceedsWithValue(size));
+ EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrZeroSize) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ char val = 'a';
+ EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
+
+ char buf = '-';
+ EXPECT_THAT(getxattr(path, name, &buf, XATTR_SIZE_MAX),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, SetxattrSizeTooLarge) {
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+
+ // Note that each particular fs implementation may stipulate a lower size
+ // limit, in which case we actually may fail (e.g. error with ENOSPC) for
+ // some sizes under XATTR_SIZE_MAX.
+ size_t size = XATTR_SIZE_MAX + 1;
+ std::vector<char> val(size);
+ EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+ SyscallFailsWithErrno(E2BIG));
+
+ // TODO(b/127675828): Support setxattr and getxattr.
+ if (!IsRunningOnGvisor()) {
+ EXPECT_THAT(getxattr(path, name, nullptr, 0),
+ SyscallFailsWithErrno(ENODATA));
+ }
+}
+
+TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
+ SyscallFailsWithErrno(EFAULT));
+
+ // TODO(b/127675828): Support setxattr and getxattr.
+ if (!IsRunningOnGvisor()) {
+ EXPECT_THAT(getxattr(path, name, nullptr, 0),
+ SyscallFailsWithErrno(ENODATA));
+ }
+}
+
+TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ std::vector<char> val(XATTR_SIZE_MAX + 1);
+ std::fill(val.begin(), val.end(), 'a');
+ size_t size = 1;
+ EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+ SyscallSucceeds());
+
+ std::vector<char> buf = {'-', '-'};
+ std::vector<char> expected_buf = {'a', '-'};
+ EXPECT_THAT(getxattr(path, name, buf.data(), size),
+ SyscallSucceedsWithValue(size));
+ EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ std::vector<char> val = {'a', 'a'};
+ EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
+ SyscallSucceeds());
+ EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
+ SyscallSucceeds());
+
+ std::vector<char> buf = {'-', '-'};
+ std::vector<char> expected_buf = {'a', '-'};
+ EXPECT_THAT(getxattr(path, name, buf.data(), 2), SyscallSucceedsWithValue(1));
+ EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrReplaceWithLarger) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ std::vector<char> val = {'a', 'a'};
+ EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
+ SyscallSucceeds());
+ EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
+ SyscallSucceeds());
+
+ std::vector<char> buf = {'-', '-'};
+ EXPECT_THAT(getxattr(path, name, buf.data(), 2), SyscallSucceedsWithValue(2));
+ EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, SetxattrCreateFlag) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
+ SyscallSucceeds());
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
+ SyscallFailsWithErrno(EEXIST));
+
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrReplaceFlag) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
+ SyscallFailsWithErrno(ENODATA));
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
+ SyscallSucceeds());
+
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrInvalidFlags) {
+ const char* path = test_file_name_.c_str();
+ int invalid_flags = 0xff;
+ EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(XattrTest, Getxattr) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ int val = 1234;
+ size_t size = sizeof(val);
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+ int buf = 0;
+ EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
+ EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ std::vector<char> val = {'a', 'a'};
+ size_t size = val.size();
+ EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0), SyscallSucceeds());
+
+ char buf = '-';
+ EXPECT_THAT(getxattr(path, name, &buf, 1), SyscallFailsWithErrno(ERANGE));
+ EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ char val = 'a';
+ EXPECT_THAT(setxattr(path, name, &val, 1, /*flags=*/0), SyscallSucceeds());
+
+ std::vector<char> buf(XATTR_SIZE_MAX);
+ std::fill(buf.begin(), buf.end(), '-');
+ std::vector<char> expected_buf = buf;
+ expected_buf[0] = 'a';
+ EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, GetxattrZeroSize) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ char val = 'a';
+ EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
+ SyscallSucceeds());
+
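+ // A zero size queries the size of the stored value without copying it.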
+ char buf = '-';
+ EXPECT_THAT(getxattr(path, name, &buf, 0),
+ SyscallSucceedsWithValue(sizeof(val)));
+ EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, GetxattrSizeTooLarge) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ char val = 'a';
+ EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
+ SyscallSucceeds());
+
+ std::vector<char> buf(XATTR_SIZE_MAX + 1);
+ std::fill(buf.begin(), buf.end(), '-');
+ std::vector<char> expected_buf = buf;
+ expected_buf[0] = 'a';
+ EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+ SyscallSucceedsWithValue(sizeof(val)));
+ EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, GetxattrNullValue) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ char val = 'a';
+ size_t size = sizeof(val);
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+ EXPECT_THAT(getxattr(path, name, nullptr, size),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
+ // TODO(b/127675828): Support setxattr and getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ char name[] = "user.abc";
+ char val = 'a';
+ size_t size = sizeof(val);
+ // Set value with zero size.
+ EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
+ // Get value with nonzero size.
+ EXPECT_THAT(getxattr(path, name, nullptr, size), SyscallSucceedsWithValue(0));
+
+ // Set value with nonzero size.
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+ // Get value with zero size.
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(size));
+}
+
+TEST_F(XattrTest, GetxattrNonexistentName) {
+ // TODO(b/127675828): Support getxattr.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const char* path = test_file_name_.c_str();
+ std::string name = "user.nonexistent";
+ EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+ SyscallFailsWithErrno(ENODATA));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
index accf46347..b9fd885ff 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/syscalls/syscall_test_runner.go
@@ -46,6 +46,7 @@ var (
debug = flag.Bool("debug", false, "enable debug logs")
strace = flag.Bool("strace", false, "enable strace logs")
platform = flag.String("platform", "ptrace", "platform to run on")
+ network = flag.String("network", "none", "network stack to run on (sandbox, host, none)")
useTmpfs = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
@@ -137,7 +138,7 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error {
args := []string{
"-root", rootDir,
- "-network=none",
+ "-network", *network,
"-log-format=text",
"-TESTONLY-unsafe-nonroot=true",
"-net-raw=true",
@@ -335,10 +336,11 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
})
}
- // Set environment variable that indicates we are
- // running in gVisor and with the given platform.
+ // Set environment variables that indicate we are
+ // running in gVisor with the given platform and network.
platformVar := "TEST_ON_GVISOR"
- env := append(os.Environ(), platformVar+"="+*platform)
+ networkVar := "GVISOR_NETWORK"
+ env := append(os.Environ(), platformVar+"="+*platform, networkVar+"="+*network)
// Remove env variables that cause the gunit binary to write output
// files, since they will stomp on each other, and on the output files
diff --git a/test/util/BUILD b/test/util/BUILD
index 4526bb3f1..cbc728159 100644
--- a/test/util/BUILD
+++ b/test/util/BUILD
@@ -237,6 +237,7 @@ cc_library(
] + select_for_linux(
[
"test_util_impl.cc",
+ "test_util_runfiles.cc",
],
),
hdrs = ["test_util.h"],
@@ -245,6 +246,7 @@ cc_library(
":logging",
":posix_error",
":save_util",
+ "@bazel_tools//tools/cpp/runfiles",
"@com_google_absl//absl/base:core_headers",
"@com_google_absl//absl/flags:flag",
"@com_google_absl//absl/flags:parse",
diff --git a/test/util/fs_util.cc b/test/util/fs_util.cc
index 88b1e7911..042cec94a 100644
--- a/test/util/fs_util.cc
+++ b/test/util/fs_util.cc
@@ -105,6 +105,15 @@ PosixErrorOr<struct stat> Stat(absl::string_view path) {
return stat_buf;
}
+PosixErrorOr<struct stat> Lstat(absl::string_view path) {
+ struct stat stat_buf;
+ int res = lstat(std::string(path).c_str(), &stat_buf);
+ if (res < 0) {
+ return PosixError(errno, absl::StrCat("lstat ", path));
+ }
+ return stat_buf;
+}
+
PosixErrorOr<struct stat> Fstat(int fd) {
struct stat stat_buf;
int res = fstat(fd, &stat_buf);
@@ -127,7 +136,7 @@ PosixErrorOr<bool> Exists(absl::string_view path) {
}
PosixErrorOr<bool> IsDirectory(absl::string_view path) {
- ASSIGN_OR_RETURN_ERRNO(struct stat stat_buf, Stat(path));
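+ // Use Lstat so that symlinks are not followed; a symlink to a directory is
+ // not itself reported as a directory.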
+ ASSIGN_OR_RETURN_ERRNO(struct stat stat_buf, Lstat(path));
if (S_ISDIR(stat_buf.st_mode)) {
return true;
}
diff --git a/test/util/test_util.cc b/test/util/test_util.cc
index 9cb050735..848504c88 100644
--- a/test/util/test_util.cc
+++ b/test/util/test_util.cc
@@ -41,6 +41,7 @@ namespace gvisor {
namespace testing {
#define TEST_ON_GVISOR "TEST_ON_GVISOR"
+#define GVISOR_NETWORK "GVISOR_NETWORK"
bool IsRunningOnGvisor() { return GvisorPlatform() != Platform::kNative; }
@@ -60,6 +61,11 @@ Platform GvisorPlatform() {
abort();
}
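+// Returns true if the test runner was invoked with --network=host; the runner
+// exports this through the GVISOR_NETWORK environment variable.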
+bool IsRunningWithHostinet() {
+ char* env = getenv(GVISOR_NETWORK);
+ return env && strcmp(env, "host") == 0;
+}
+
// Inline cpuid instruction. Preserve %ebx/%rbx register. In PIC compilations
// %ebx contains the address of the global offset table. %rbx is occasionally
// used to address stack variables in presence of dynamic allocas.
diff --git a/test/util/test_util.h b/test/util/test_util.h
index dc30575b8..b3235c7e3 100644
--- a/test/util/test_util.h
+++ b/test/util/test_util.h
@@ -220,6 +220,7 @@ enum class Platform {
};
bool IsRunningOnGvisor();
Platform GvisorPlatform();
+bool IsRunningWithHostinet();
#ifdef __linux__
void SetupGvisorDeathTest();
@@ -764,6 +765,12 @@ MATCHER_P2(EquivalentWithin, target, tolerance,
return Equivalent(arg, target, tolerance);
}
+// Returns the absolute path to a data dependency. 'path' is the runfile
+// location relative to workspace root.
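+// The result can be used to open files declared in a test target's 'data'
+// attribute.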
+#ifdef __linux__
+std::string RunfilePath(std::string path);
+#endif
+
void TestInit(int* argc, char*** argv);
} // namespace testing
diff --git a/test/util/test_util_runfiles.cc b/test/util/test_util_runfiles.cc
new file mode 100644
index 000000000..7210094eb
--- /dev/null
+++ b/test/util/test_util_runfiles.cc
@@ -0,0 +1,46 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <string>
+
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+#include "tools/cpp/runfiles/runfiles.h"
+
+namespace gvisor {
+namespace testing {
+
+std::string RunfilePath(std::string path) {
+ static const bazel::tools::cpp::runfiles::Runfiles* const runfiles = [] {
+ std::string error;
+ auto* runfiles =
+ bazel::tools::cpp::runfiles::Runfiles::CreateForTest(&error);
+ if (runfiles == nullptr) {
+ std::cerr << "Unable to find runfiles: " << error << std::endl;
+ }
+ return runfiles;
+ }();
+
+ if (!runfiles) {
+ // Can't find runfiles? This probably won't work, but __main__/path is our
+ // best guess.
+ return JoinPath("__main__", path);
+ }
+
+ return runfiles->Rlocation(JoinPath("__main__", path));
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/tools/go_stateify/defs.bzl b/tools/go_stateify/defs.bzl
index 3ce36c1c8..33267c074 100644
--- a/tools/go_stateify/defs.bzl
+++ b/tools/go_stateify/defs.bzl
@@ -43,12 +43,13 @@ def _go_stateify_impl(ctx):
# Run the stateify command.
args = ["-output=%s" % output.path]
- args += ["-pkg=%s" % ctx.attr.package]
+ args.append("-pkg=%s" % ctx.attr.package)
+ args.append("-arch=%s" % ctx.attr.arch)
if ctx.attr._statepkg:
- args += ["-statepkg=%s" % ctx.attr._statepkg]
+ args.append("-statepkg=%s" % ctx.attr._statepkg)
if ctx.attr.imports:
- args += ["-imports=%s" % ",".join(ctx.attr.imports)]
- args += ["--"]
+ args.append("-imports=%s" % ",".join(ctx.attr.imports))
+ args.append("--")
for src in ctx.attr.srcs:
args += [f.path for f in src.files.to_list()]
ctx.actions.run(
@@ -83,6 +84,10 @@ for statified types.
doc = "The package name for the input sources.",
mandatory = True,
),
+ "arch": attr.string(
+ doc = "Target platform.",
+ mandatory = True,
+ ),
"out": attr.output(
doc = """
The name of the generated file output. This must not conflict with any other
@@ -118,6 +123,10 @@ def go_library(name, srcs, deps = [], imports = [], **kwargs):
srcs = [src for src in srcs if src.endswith(".go")],
imports = imports,
package = name,
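+ # Pass the target architecture so stateify can skip sources that do not
+ # build for it.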
+ arch = select({
+ "@bazel_tools//src/conditions:linux_aarch64": "arm64",
+ "//conditions:default": "amd64",
+ }),
out = name + "_state_autogen.go",
)
all_srcs = srcs + [name + "_state_autogen.go"]
diff --git a/tools/go_stateify/main.go b/tools/go_stateify/main.go
index db7a7107b..7d5d291e6 100644
--- a/tools/go_stateify/main.go
+++ b/tools/go_stateify/main.go
@@ -22,7 +22,9 @@ import (
"go/ast"
"go/parser"
"go/token"
+ "io/ioutil"
"os"
+ "path/filepath"
"reflect"
"strings"
"sync"
@@ -33,8 +35,113 @@ var (
imports = flag.String("imports", "", "extra imports for the output file")
output = flag.String("output", "", "output file")
statePkg = flag.String("statepkg", "", "state import package; defaults to empty")
+ arch = flag.String("arch", "", "specify the target platform")
)
+// The known architectures.
+var okgoarch = []string{
+ "386",
+ "amd64",
+ "arm",
+ "arm64",
+ "mips",
+ "mipsle",
+ "mips64",
+ "mips64le",
+ "ppc64",
+ "ppc64le",
+ "riscv64",
+ "s390x",
+ "sparc64",
+ "wasm",
+}
+
+// readfile returns the content of the named file.
+func readfile(file string) string {
+ data, err := ioutil.ReadFile(file)
+ if err != nil {
+ panic(fmt.Sprintf("readfile err: %v", err))
+ }
+ return string(data)
+}
+
+// matchfield reports whether the field (x,y,z) matches this build.
+// All the elements in the field must be satisfied.
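+// For example, the field "amd64,!arm64" is satisfied when goarch is "amd64".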
+func matchfield(f string, goarch string) bool {
+ for _, tag := range strings.Split(f, ",") {
+ if !matchtag(tag, goarch) {
+ return false
+ }
+ }
+ return true
+}
+
+// matchtag reports whether the tag (x or !x) matches this build.
+func matchtag(tag string, goarch string) bool {
+ if tag == "" {
+ return false
+ }
+ if tag[0] == '!' {
+ if len(tag) == 1 || tag[1] == '!' {
+ return false
+ }
+ return !matchtag(tag[1:], goarch)
+ }
+ return tag == goarch
+}
+
+// canBuild reports whether we can build this file for the target platform by
+// checking the file name and build tags. The code is derived from the Go source
+// cmd.dist.build.shouldbuild.
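+// For example, "foo_arm64.go" is excluded when goTargetArch is "amd64".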
+func canBuild(file, goTargetArch string) bool {
+ name := filepath.Base(file)
+ excluded := func(list []string, ok string) bool {
+ for _, x := range list {
+ if x == ok || (ok == "android" && x == "linux") || (ok == "illumos" && x == "solaris") {
+ continue
+ }
+ i := strings.Index(name, x)
+ if i <= 0 || name[i-1] != '_' {
+ continue
+ }
+ i += len(x)
+ if i == len(name) || name[i] == '.' || name[i] == '_' {
+ return true
+ }
+ }
+ return false
+ }
+ if excluded(okgoarch, goTargetArch) {
+ return false
+ }
+
+ // Check file contents for // +build lines.
+ for _, p := range strings.Split(readfile(file), "\n") {
+ p = strings.TrimSpace(p)
+ if p == "" {
+ continue
+ }
+ if !strings.HasPrefix(p, "//") {
+ break
+ }
+ if !strings.Contains(p, "+build") {
+ continue
+ }
+ fields := strings.Fields(p[2:])
+ if len(fields) < 1 || fields[0] != "+build" {
+ continue
+ }
+ for _, p := range fields[1:] {
+ if matchfield(p, goTargetArch) {
+ goto fieldmatch
+ }
+ }
+ return false
+ fieldmatch:
+ }
+ return true
+}
+
// resolveTypeName returns a qualified type name.
func resolveTypeName(name string, typ ast.Expr) (field string, qualified string) {
for done := false; !done; {
@@ -256,6 +363,11 @@ func main() {
fmt.Fprintf(os.Stderr, "Input %q can't be parsed: %v\n", filename, err)
os.Exit(1)
}
+
+ if !canBuild(filename, *arch) {
+ continue
+ }
+
files = append(files, f)
}
diff --git a/tools/issue_reviver/BUILD b/tools/issue_reviver/BUILD
new file mode 100644
index 000000000..ee7ea11fd
--- /dev/null
+++ b/tools/issue_reviver/BUILD
@@ -0,0 +1,12 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+ name = "issue_reviver",
+ srcs = ["main.go"],
+ deps = [
+ "//tools/issue_reviver/github",
+ "//tools/issue_reviver/reviver",
+ ],
+)
diff --git a/tools/issue_reviver/github/BUILD b/tools/issue_reviver/github/BUILD
new file mode 100644
index 000000000..6da22ba1c
--- /dev/null
+++ b/tools/issue_reviver/github/BUILD
@@ -0,0 +1,17 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "github",
+ srcs = ["github.go"],
+ importpath = "gvisor.dev/gvisor/tools/issue_reviver/github",
+ visibility = [
+ "//tools/issue_reviver:__subpackages__",
+ ],
+ deps = [
+ "//tools/issue_reviver/reviver",
+ "@com_github_google_go-github//github:go_default_library",
+ "@org_golang_x_oauth2//:go_default_library",
+ ],
+)
diff --git a/tools/issue_reviver/github/github.go b/tools/issue_reviver/github/github.go
new file mode 100644
index 000000000..e07949c8f
--- /dev/null
+++ b/tools/issue_reviver/github/github.go
@@ -0,0 +1,164 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package github implements the reviver.Bugger interface on top of GitHub issues.
+package github
+
+import (
+ "context"
+ "fmt"
+ "strconv"
+ "strings"
+ "time"
+
+ "github.com/google/go-github/github"
+ "golang.org/x/oauth2"
+ "gvisor.dev/gvisor/tools/issue_reviver/reviver"
+)
+
+// Bugger implements the reviver.Bugger interface for GitHub issues.
+type Bugger struct {
+ owner string
+ repo string
+ dryRun bool
+
+ client *github.Client
+ issues map[int]*github.Issue
+}
+
+// NewBugger creates a new Bugger.
+func NewBugger(token, owner, repo string, dryRun bool) (*Bugger, error) {
+ b := &Bugger{
+ owner: owner,
+ repo: repo,
+ dryRun: dryRun,
+ issues: map[int]*github.Issue{},
+ }
+ if err := b.load(token); err != nil {
+ return nil, err
+ }
+ return b, nil
+}
+
+func (b *Bugger) load(token string) error {
+ ctx := context.Background()
+ if len(token) == 0 {
+ fmt.Print("No OAUTH token provided, using unauthenticated account.\n")
+ b.client = github.NewClient(nil)
+ } else {
+ ts := oauth2.StaticTokenSource(
+ &oauth2.Token{AccessToken: token},
+ )
+ tc := oauth2.NewClient(ctx, ts)
+ b.client = github.NewClient(tc)
+ }
+
+ err := processAllPages(func(listOpts github.ListOptions) (*github.Response, error) {
+ opts := &github.IssueListByRepoOptions{State: "open", ListOptions: listOpts}
+ tmps, resp, err := b.client.Issues.ListByRepo(ctx, b.owner, b.repo, opts)
+ if err != nil {
+ return resp, err
+ }
+ for _, issue := range tmps {
+ b.issues[issue.GetNumber()] = issue
+ }
+ return resp, nil
+ })
+ if err != nil {
+ return err
+ }
+
+ fmt.Printf("Loaded %d issues from github.com/%s/%s\n", len(b.issues), b.owner, b.repo)
+ return nil
+}
+
+// Activate implements reviver.Bugger.
+func (b *Bugger) Activate(todo *reviver.Todo) (bool, error) {
+ const prefix = "gvisor.dev/issue/"
+
+ // First check whether this Bugger can handle the TODO.
+ idStr := strings.TrimPrefix(todo.Issue, prefix)
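+ // If todo.Issue does not start with the prefix, TrimPrefix returns it
+ // unchanged and this Bugger does not handle the TODO.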
+ if len(todo.Issue) == len(idStr) {
+ return false, nil
+ }
+
+ id, err := strconv.Atoi(idStr)
+ if err != nil {
+ return true, err
+ }
+
+ // Check against active issues cache.
+ if _, ok := b.issues[id]; ok {
+ fmt.Printf("%q is active: OK\n", todo.Issue)
+ return true, nil
+ }
+
+ fmt.Printf("%q is not active: reopening issue %d\n", todo.Issue, id)
+
+ // Format comment with TODO locations and search link.
+ comment := strings.Builder{}
+ fmt.Fprintln(&comment, "There are TODOs still referencing this issue:")
+ for _, l := range todo.Locations {
+ fmt.Fprintf(&comment,
+ "1. [%s:%d](https://github.com/%s/%s/blob/HEAD/%s#%d): %s\n",
+ l.File, l.Line, b.owner, b.repo, l.File, l.Line, l.Comment)
+ }
+ fmt.Fprintf(&comment,
+ "\n\nSearch [TODO](https://github.com/%s/%s/search?q=%%22%s%d%%22)", b.owner, b.repo, prefix, id)
+
+ if b.dryRun {
+ fmt.Printf("[dry-run: skipping change to issue %d]\n%s\n=======================\n", id, comment.String())
+ return true, nil
+ }
+
+ ctx := context.Background()
+ req := &github.IssueRequest{State: github.String("open")}
+ _, _, err = b.client.Issues.Edit(ctx, b.owner, b.repo, id, req)
+ if err != nil {
+ return true, fmt.Errorf("failed to reactivate issue %d: %v", id, err)
+ }
+
+ cmt := &github.IssueComment{
+ Body: github.String(comment.String()),
+ Reactions: &github.Reactions{Confused: github.Int(1)},
+ }
+ if _, _, err := b.client.Issues.CreateComment(ctx, b.owner, b.repo, id, cmt); err != nil {
+ return true, fmt.Errorf("failed to add comment to issue %d: %v", id, err)
+ }
+
+ return true, nil
+}
+
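+// processAllPages calls fn for each page of results until there are no more
+// pages, sleeping and retrying when rate limited for less than 5 minutes.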
+func processAllPages(fn func(github.ListOptions) (*github.Response, error)) error {
+ opts := github.ListOptions{PerPage: 1000}
+ for {
+ resp, err := fn(opts)
+ if err != nil {
+ if rateErr, ok := err.(*github.RateLimitError); ok {
+ duration := rateErr.Rate.Reset.Sub(time.Now())
+ if duration > 5*time.Minute {
+ return fmt.Errorf("Rate limited for too long: %v", duration)
+ }
+ fmt.Printf("Rate limited, sleeping for: %v\n", duration)
+ time.Sleep(duration)
+ continue
+ }
+ return err
+ }
+ if resp.NextPage == 0 {
+ return nil
+ }
+ opts.Page = resp.NextPage
+ }
+}
diff --git a/tools/issue_reviver/main.go b/tools/issue_reviver/main.go
new file mode 100644
index 000000000..4256f5a6c
--- /dev/null
+++ b/tools/issue_reviver/main.go
@@ -0,0 +1,89 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package main is the entry point for issue_reviver.
+package main
+
+import (
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "os"
+
+ "gvisor.dev/gvisor/tools/issue_reviver/github"
+ "gvisor.dev/gvisor/tools/issue_reviver/reviver"
+)
+
+var (
+ owner string
+ repo string
+ tokenFile string
+ path string
+ dryRun bool
+)
+
+// Keep the options simple for now. Supports only a single path and repo.
+func init() {
+ flag.StringVar(&owner, "owner", "google", "Github project org/owner to look for issues")
+ flag.StringVar(&repo, "repo", "gvisor", "Github repo to look for issues")
+ flag.StringVar(&tokenFile, "oauth-token-file", "", "Path to file containing the OAUTH token to be used as credential to github")
+ flag.StringVar(&path, "path", "", "Path to scan for TODOs")
+ flag.BoolVar(&dryRun, "dry-run", false, "If set to true, no changes are made to issues")
+}
+
+func main() {
+ flag.Parse()
+
+ // Check for mandatory parameters.
+ if len(owner) == 0 {
+ fmt.Println("missing --owner option.")
+ flag.Usage()
+ os.Exit(1)
+ }
+ if len(repo) == 0 {
+ fmt.Println("missing --repo option.")
+ flag.Usage()
+ os.Exit(1)
+ }
+ if len(path) == 0 {
+ fmt.Println("missing --path option.")
+ flag.Usage()
+ os.Exit(1)
+ }
+
+ // Token is passed as a file so it doesn't show up in command line arguments.
+ var token string
+ if len(tokenFile) != 0 {
+ bytes, err := ioutil.ReadFile(tokenFile)
+ if err != nil {
+ fmt.Println(err.Error())
+ os.Exit(1)
+ }
+ token = string(bytes)
+ }
+
+ bugger, err := github.NewBugger(token, owner, repo, dryRun)
+ if err != nil {
+ fmt.Fprintln(os.Stderr, "Error getting github issues:", err)
+ os.Exit(1)
+ }
+ rev := reviver.New([]string{path}, []reviver.Bugger{bugger})
+ if errs := rev.Run(); len(errs) > 0 {
+ fmt.Fprintf(os.Stderr, "Encountered %d errors:\n", len(errs))
+ for _, err := range errs {
+ fmt.Fprintf(os.Stderr, "\t%v\n", err)
+ }
+ os.Exit(1)
+ }
+}
diff --git a/tools/issue_reviver/reviver/BUILD b/tools/issue_reviver/reviver/BUILD
new file mode 100644
index 000000000..2c3675977
--- /dev/null
+++ b/tools/issue_reviver/reviver/BUILD
@@ -0,0 +1,19 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "reviver",
+ srcs = ["reviver.go"],
+ importpath = "gvisor.dev/gvisor/tools/issue_reviver/reviver",
+ visibility = [
+ "//tools/issue_reviver:__subpackages__",
+ ],
+)
+
+go_test(
+ name = "reviver_test",
+ size = "small",
+ srcs = ["reviver_test.go"],
+ embed = [":reviver"],
+)
diff --git a/tools/issue_reviver/reviver/reviver.go b/tools/issue_reviver/reviver/reviver.go
new file mode 100644
index 000000000..682db0c01
--- /dev/null
+++ b/tools/issue_reviver/reviver/reviver.go
@@ -0,0 +1,192 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package reviver scans the code looking for TODOs and passes them to
+// registered Buggers to ensure TODOs point to active issues.
+package reviver
+
+import (
+ "bufio"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "regexp"
+ "sync"
+)
+
+// This is what a TODO looks like.
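+// For example, a comment of the form "TODO(gvisor.dev/issue/123): fix this"
+// is captured with issue "gvisor.dev/issue/123".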
+var regexTodo = regexp.MustCompile(`(\/\/|#)\s*(TODO|FIXME)\(([a-zA-Z0-9.\/]+)\):\s*(.+)`)
+
+// Bugger is called for every TODO found in the code. If it can handle the
+// TODO, it must return true. If it returns false, the next Bugger is called.
+// If no Bugger handles the TODO, it's dropped on the floor.
+type Bugger interface {
+ Activate(todo *Todo) (bool, error)
+}
+
+// Location saves the location where the TODO was found.
+type Location struct {
+ Comment string
+ File string
+ Line uint
+}
+
+// Todo represents a unique TODO. There can be several TODOs pointing to the
+// same issue in the code. They are all grouped together.
+type Todo struct {
+ Issue string
+ Locations []Location
+}
+
+// Reviver scans the given paths for TODOs and calls Buggers to handle them.
+type Reviver struct {
+ paths []string
+ buggers []Bugger
+
+ mu sync.Mutex
+ todos map[string]*Todo
+ errs []error
+}
+
+// New creates a new Reviver.
+func New(paths []string, buggers []Bugger) *Reviver {
+ return &Reviver{
+ paths: paths,
+ buggers: buggers,
+ todos: map[string]*Todo{},
+ }
+}
+
+// Run scans all paths and processes every TODO found. It returns all errors
+// encountered during processing; it doesn't stop on errors.
+func (r *Reviver) Run() []error {
+ // Process each directory in parallel.
+ wg := sync.WaitGroup{}
+ for _, path := range r.paths {
+ wg.Add(1)
+ go func(path string) {
+ defer wg.Done()
+ r.processPath(path, &wg)
+ }(path)
+ }
+
+ wg.Wait()
+
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ fmt.Printf("Processing %d TODOs (%d errors)...\n", len(r.todos), len(r.errs))
+ dropped := 0
+ for _, todo := range r.todos {
+ ok, err := r.processTodo(todo)
+ if err != nil {
+ r.errs = append(r.errs, err)
+ }
+ if !ok {
+ dropped++
+ }
+ }
+ fmt.Printf("Processed %d TODOs, %d were skipped (%d errors)\n", len(r.todos)-dropped, dropped, len(r.errs))
+
+ return r.errs
+}
+
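+// processPath scans every regular file under path for TODOs, descending into
+// subdirectories in new goroutines tracked by wg.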
+func (r *Reviver) processPath(path string, wg *sync.WaitGroup) {
+ fmt.Printf("Processing dir %q\n", path)
+ fis, err := ioutil.ReadDir(path)
+ if err != nil {
+ r.addErr(fmt.Errorf("error processing dir %q: %v", path, err))
+ return
+ }
+
+ for _, fi := range fis {
+ childPath := filepath.Join(path, fi.Name())
+ switch {
+ case fi.Mode().IsDir():
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ r.processPath(childPath, wg)
+ }()
+
+ case fi.Mode().IsRegular():
+ file, err := os.Open(childPath)
+ if err != nil {
+ r.addErr(err)
+ continue
+ }
+
+ scanner := bufio.NewScanner(file)
+ lineno := uint(0)
+ for scanner.Scan() {
+ lineno++
+ line := scanner.Text()
+ if todo := r.processLine(line, childPath, lineno); todo != nil {
+ r.addTodo(todo)
+ }
+ }
+ }
+ }
+}
+
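+// processLine returns a Todo if line contains a TODO or FIXME referencing an
+// issue, or nil otherwise.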
+func (r *Reviver) processLine(line, path string, lineno uint) *Todo {
+ matches := regexTodo.FindStringSubmatch(line)
+ if matches == nil {
+ return nil
+ }
+ if len(matches) != 5 {
+ panic(fmt.Sprintf("regex returned wrong matches for %q: %v", line, matches))
+ }
+ return &Todo{
+ Issue: matches[3],
+ Locations: []Location{
+ {
+ File: path,
+ Line: lineno,
+ Comment: matches[4],
+ },
+ },
+ }
+}
+
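+// addTodo records newTodo, merging its locations into any existing Todo for
+// the same issue.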
+func (r *Reviver) addTodo(newTodo *Todo) {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+
+ if todo := r.todos[newTodo.Issue]; todo == nil {
+ r.todos[newTodo.Issue] = newTodo
+ } else {
+ todo.Locations = append(todo.Locations, newTodo.Locations...)
+ }
+}
+
+func (r *Reviver) addErr(err error) {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+ r.errs = append(r.errs, err)
+}
+
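+// processTodo offers the TODO to each Bugger in order and reports whether one
+// of them handled it.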
+func (r *Reviver) processTodo(todo *Todo) (bool, error) {
+ for _, bugger := range r.buggers {
+ ok, err := bugger.Activate(todo)
+ if err != nil {
+ return false, err
+ }
+ if ok {
+ return true, nil
+ }
+ }
+ return false, nil
+}
diff --git a/tools/issue_reviver/reviver/reviver_test.go b/tools/issue_reviver/reviver/reviver_test.go
new file mode 100644
index 000000000..a9fb1f9f1
--- /dev/null
+++ b/tools/issue_reviver/reviver/reviver_test.go
@@ -0,0 +1,88 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package reviver
+
+import (
+ "testing"
+)
+
+func TestProcessLine(t *testing.T) {
+ for _, tc := range []struct {
+ line string
+ want *Todo
+ }{
+ {
+ line: "// TODO(foobar.com/issue/123): comment, bla. blabla.",
+ want: &Todo{
+ Issue: "foobar.com/issue/123",
+ Locations: []Location{
+ {Comment: "comment, bla. blabla."},
+ },
+ },
+ },
+ {
+ line: "// FIXME(b/123): internal bug",
+ want: &Todo{
+ Issue: "b/123",
+ Locations: []Location{
+ {Comment: "internal bug"},
+ },
+ },
+ },
+ {
+ line: "TODO(issue): not todo",
+ },
+ {
+ line: "FIXME(issue): not todo",
+ },
+ {
+ line: "// TODO (issue): not todo",
+ },
+ {
+ line: "// TODO(issue) not todo",
+ },
+ {
+ line: "// todo(issue): not todo",
+ },
+ {
+ line: "// TODO(issue):",
+ },
+ } {
+ t.Logf("Testing: %s", tc.line)
+ r := Reviver{}
+ got := r.processLine(tc.line, "test", 0)
+ if got == nil {
+ if tc.want != nil {
+ t.Errorf("failed to process line, want: %+v", tc.want)
+ }
+ } else {
+ if tc.want == nil {
+ t.Errorf("expected error, got: %+v", got)
+ continue
+ }
+ if got.Issue != tc.want.Issue {
+ t.Errorf("wrong issue, got: %v, want: %v", got.Issue, tc.want.Issue)
+ }
+ if len(got.Locations) != len(tc.want.Locations) {
+ t.Errorf("wrong number of locations, got: %v, want: %v, locations: %+v", len(got.Locations), len(tc.want.Locations), got.Locations)
+ }
+ for i, wantLoc := range tc.want.Locations {
+ if got.Locations[i].Comment != wantLoc.Comment {
+ t.Errorf("wrong comment, got: %v, want: %v", got.Locations[i].Comment, wantLoc.Comment)
+ }
+ }
+ }
+ }
+}
diff --git a/vdso/BUILD b/vdso/BUILD
index 7ceed349e..2b6744c26 100644
--- a/vdso/BUILD
+++ b/vdso/BUILD
@@ -68,14 +68,14 @@ genrule(
"&& $(location :check_vdso) " +
"--check-data " +
"--vdso $(location vdso.so) ",
+ exec_tools = [
+ ":check_vdso",
+ ],
features = ["-pie"],
toolchains = [
"@bazel_tools//tools/cpp:current_cc_toolchain",
":no_pie_cc_flags",
],
- tools = [
- ":check_vdso",
- ],
visibility = ["//:sandbox"],
)
@@ -87,6 +87,6 @@ cc_flags_supplier(
py_binary(
name = "check_vdso",
srcs = ["check_vdso.py"],
- python_version = "PY2",
+ python_version = "PY3",
visibility = ["//:sandbox"],
)