summaryrefslogtreecommitdiffhomepage
path: root/test/syscalls/linux
diff options
context:
space:
mode:
Diffstat (limited to 'test/syscalls/linux')
-rw-r--r--test/syscalls/linux/32bit.cc248
-rw-r--r--test/syscalls/linux/BUILD3933
-rw-r--r--test/syscalls/linux/accept_bind.cc641
-rw-r--r--test/syscalls/linux/accept_bind_stream.cc92
-rw-r--r--test/syscalls/linux/access.cc170
-rw-r--r--test/syscalls/linux/affinity.cc242
-rw-r--r--test/syscalls/linux/aio.cc430
-rw-r--r--test/syscalls/linux/alarm.cc192
-rw-r--r--test/syscalls/linux/arch_prctl.cc48
-rw-r--r--test/syscalls/linux/bad.cc45
-rw-r--r--test/syscalls/linux/base_poll_test.cc65
-rw-r--r--test/syscalls/linux/base_poll_test.h101
-rw-r--r--test/syscalls/linux/bind.cc145
-rw-r--r--test/syscalls/linux/brk.cc31
-rw-r--r--test/syscalls/linux/chdir.cc64
-rw-r--r--test/syscalls/linux/chmod.cc264
-rw-r--r--test/syscalls/linux/chown.cc206
-rw-r--r--test/syscalls/linux/chroot.cc366
-rw-r--r--test/syscalls/linux/clock_getres.cc37
-rw-r--r--test/syscalls/linux/clock_gettime.cc163
-rw-r--r--test/syscalls/linux/clock_nanosleep.cc179
-rw-r--r--test/syscalls/linux/concurrency.cc127
-rw-r--r--test/syscalls/linux/connect_external.cc163
-rw-r--r--test/syscalls/linux/creat.cc68
-rw-r--r--test/syscalls/linux/dev.cc167
-rw-r--r--test/syscalls/linux/dup.cc133
-rw-r--r--test/syscalls/linux/epoll.cc428
-rw-r--r--test/syscalls/linux/eventfd.cc222
-rw-r--r--test/syscalls/linux/exceptions.cc367
-rw-r--r--test/syscalls/linux/exec.cc904
-rw-r--r--test/syscalls/linux/exec.h34
-rw-r--r--test/syscalls/linux/exec_assert_closed_workload.cc45
-rw-r--r--test/syscalls/linux/exec_basic_workload.cc31
-rw-r--r--test/syscalls/linux/exec_binary.cc1646
-rw-r--r--test/syscalls/linux/exec_proc_exe_workload.cc42
-rw-r--r--test/syscalls/linux/exec_state_workload.cc202
-rw-r--r--test/syscalls/linux/exit.cc78
-rwxr-xr-xtest/syscalls/linux/exit_script.sh22
-rw-r--r--test/syscalls/linux/fadvise64.cc72
-rw-r--r--test/syscalls/linux/fallocate.cc186
-rw-r--r--test/syscalls/linux/fault.cc74
-rw-r--r--test/syscalls/linux/fchdir.cc77
-rw-r--r--test/syscalls/linux/fcntl.cc1353
-rw-r--r--test/syscalls/linux/file_base.h100
-rw-r--r--test/syscalls/linux/flock.cc636
-rw-r--r--test/syscalls/linux/fork.cc464
-rw-r--r--test/syscalls/linux/fpsig_fork.cc131
-rw-r--r--test/syscalls/linux/fpsig_nested.cc167
-rw-r--r--test/syscalls/linux/fsync.cc58
-rw-r--r--test/syscalls/linux/futex.cc742
-rw-r--r--test/syscalls/linux/getcpu.cc40
-rw-r--r--test/syscalls/linux/getdents.cc539
-rw-r--r--test/syscalls/linux/getrandom.cc63
-rw-r--r--test/syscalls/linux/getrusage.cc177
-rw-r--r--test/syscalls/linux/inotify.cc2380
-rw-r--r--test/syscalls/linux/ioctl.cc406
-rw-r--r--test/syscalls/linux/ip_socket_test_util.cc239
-rw-r--r--test/syscalls/linux/ip_socket_test_util.h135
-rw-r--r--test/syscalls/linux/iptables.cc204
-rw-r--r--test/syscalls/linux/iptables.h198
-rw-r--r--test/syscalls/linux/itimer.cc366
-rw-r--r--test/syscalls/linux/kill.cc383
-rw-r--r--test/syscalls/linux/link.cc305
-rw-r--r--test/syscalls/linux/lseek.cc202
-rw-r--r--test/syscalls/linux/madvise.cc251
-rw-r--r--test/syscalls/linux/memfd.cc557
-rw-r--r--test/syscalls/linux/memory_accounting.cc99
-rw-r--r--test/syscalls/linux/mempolicy.cc289
-rw-r--r--test/syscalls/linux/mincore.cc96
-rw-r--r--test/syscalls/linux/mkdir.cc88
-rw-r--r--test/syscalls/linux/mknod.cc190
-rw-r--r--test/syscalls/linux/mlock.cc332
-rw-r--r--test/syscalls/linux/mmap.cc1676
-rw-r--r--test/syscalls/linux/mount.cc327
-rw-r--r--test/syscalls/linux/mremap.cc492
-rw-r--r--test/syscalls/linux/msync.cc151
-rw-r--r--test/syscalls/linux/munmap.cc53
-rw-r--r--test/syscalls/linux/network_namespace.cc52
-rw-r--r--test/syscalls/linux/open.cc451
-rw-r--r--test/syscalls/linux/open_create.cc155
-rw-r--r--test/syscalls/linux/packet_socket.cc440
-rw-r--r--test/syscalls/linux/packet_socket_raw.cc565
-rw-r--r--test/syscalls/linux/partial_bad_buffer.cc405
-rw-r--r--test/syscalls/linux/pause.cc88
-rw-r--r--test/syscalls/linux/ping_socket.cc91
-rw-r--r--test/syscalls/linux/pipe.cc670
-rw-r--r--test/syscalls/linux/poll.cc294
-rw-r--r--test/syscalls/linux/ppoll.cc155
-rw-r--r--test/syscalls/linux/prctl.cc230
-rw-r--r--test/syscalls/linux/prctl_setuid.cc268
-rw-r--r--test/syscalls/linux/pread64.cc167
-rw-r--r--test/syscalls/linux/preadv.cc95
-rw-r--r--test/syscalls/linux/preadv2.cc280
-rw-r--r--test/syscalls/linux/priority.cc216
-rw-r--r--test/syscalls/linux/priority_execve.cc42
-rw-r--r--test/syscalls/linux/proc.cc2173
-rw-r--r--test/syscalls/linux/proc_net.cc482
-rw-r--r--test/syscalls/linux/proc_net_tcp.cc496
-rw-r--r--test/syscalls/linux/proc_net_udp.cc309
-rw-r--r--test/syscalls/linux/proc_net_unix.cc443
-rw-r--r--test/syscalls/linux/proc_pid_oomscore.cc72
-rw-r--r--test/syscalls/linux/proc_pid_smaps.cc468
-rw-r--r--test/syscalls/linux/proc_pid_uid_gid_map.cc311
-rw-r--r--test/syscalls/linux/pselect.cc190
-rw-r--r--test/syscalls/linux/ptrace.cc1229
-rw-r--r--test/syscalls/linux/pty.cc1627
-rw-r--r--test/syscalls/linux/pty_root.cc78
-rw-r--r--test/syscalls/linux/pwrite64.cc83
-rw-r--r--test/syscalls/linux/pwritev2.cc307
-rw-r--r--test/syscalls/linux/raw_socket.cc819
-rw-r--r--test/syscalls/linux/raw_socket_hdrincl.cc406
-rw-r--r--test/syscalls/linux/raw_socket_icmp.cc514
-rw-r--r--test/syscalls/linux/read.cc118
-rw-r--r--test/syscalls/linux/readahead.cc91
-rw-r--r--test/syscalls/linux/readv.cc294
-rw-r--r--test/syscalls/linux/readv_common.cc220
-rw-r--r--test/syscalls/linux/readv_common.h61
-rw-r--r--test/syscalls/linux/readv_socket.cc212
-rw-r--r--test/syscalls/linux/rename.cc394
-rw-r--r--test/syscalls/linux/rlimits.cc75
-rw-r--r--test/syscalls/linux/rseq.cc198
-rw-r--r--test/syscalls/linux/rseq/BUILD61
-rw-r--r--test/syscalls/linux/rseq/critical.h39
-rw-r--r--test/syscalls/linux/rseq/critical_amd64.S66
-rw-r--r--test/syscalls/linux/rseq/critical_arm64.S66
-rw-r--r--test/syscalls/linux/rseq/rseq.cc366
-rw-r--r--test/syscalls/linux/rseq/start_amd64.S45
-rw-r--r--test/syscalls/linux/rseq/start_arm64.S45
-rw-r--r--test/syscalls/linux/rseq/syscalls.h69
-rw-r--r--test/syscalls/linux/rseq/test.h43
-rw-r--r--test/syscalls/linux/rseq/types.h31
-rw-r--r--test/syscalls/linux/rseq/uapi.h51
-rw-r--r--test/syscalls/linux/rtsignal.cc171
-rw-r--r--test/syscalls/linux/sched.cc71
-rw-r--r--test/syscalls/linux/sched_yield.cc33
-rw-r--r--test/syscalls/linux/seccomp.cc425
-rw-r--r--test/syscalls/linux/select.cc168
-rw-r--r--test/syscalls/linux/semaphore.cc491
-rw-r--r--test/syscalls/linux/sendfile.cc587
-rw-r--r--test/syscalls/linux/sendfile_socket.cc231
-rw-r--r--test/syscalls/linux/shm.cc508
-rw-r--r--test/syscalls/linux/sigaction.cc79
-rw-r--r--test/syscalls/linux/sigaltstack.cc268
-rw-r--r--test/syscalls/linux/sigaltstack_check.cc33
-rw-r--r--test/syscalls/linux/sigiret.cc136
-rw-r--r--test/syscalls/linux/signalfd.cc373
-rw-r--r--test/syscalls/linux/sigprocmask.cc269
-rw-r--r--test/syscalls/linux/sigstop.cc151
-rw-r--r--test/syscalls/linux/sigtimedwait.cc323
-rw-r--r--test/syscalls/linux/socket.cc121
-rw-r--r--test/syscalls/linux/socket_abstract.cc49
-rw-r--r--test/syscalls/linux/socket_bind_to_device.cc313
-rw-r--r--test/syscalls/linux/socket_bind_to_device_distribution.cc401
-rw-r--r--test/syscalls/linux/socket_bind_to_device_sequence.cc513
-rw-r--r--test/syscalls/linux/socket_bind_to_device_util.cc75
-rw-r--r--test/syscalls/linux/socket_bind_to_device_util.h67
-rw-r--r--test/syscalls/linux/socket_blocking.cc60
-rw-r--r--test/syscalls/linux/socket_blocking.h29
-rw-r--r--test/syscalls/linux/socket_capability.cc61
-rw-r--r--test/syscalls/linux/socket_filesystem.cc49
-rw-r--r--test/syscalls/linux/socket_generic.cc820
-rw-r--r--test/syscalls/linux/socket_generic.h30
-rw-r--r--test/syscalls/linux/socket_generic_stress.cc83
-rw-r--r--test/syscalls/linux/socket_inet_loopback.cc2566
-rw-r--r--test/syscalls/linux/socket_inet_loopback_nogotsan.cc171
-rw-r--r--test/syscalls/linux/socket_ip_loopback_blocking.cc49
-rw-r--r--test/syscalls/linux/socket_ip_tcp_generic.cc1054
-rw-r--r--test/syscalls/linux/socket_ip_tcp_generic.h29
-rw-r--r--test/syscalls/linux/socket_ip_tcp_generic_loopback.cc45
-rw-r--r--test/syscalls/linux/socket_ip_tcp_loopback.cc40
-rw-r--r--test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc45
-rw-r--r--test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc44
-rw-r--r--test/syscalls/linux/socket_ip_tcp_udp_generic.cc77
-rw-r--r--test/syscalls/linux/socket_ip_udp_generic.cc452
-rw-r--r--test/syscalls/linux/socket_ip_udp_generic.h29
-rw-r--r--test/syscalls/linux/socket_ip_udp_loopback.cc50
-rw-r--r--test/syscalls/linux/socket_ip_udp_loopback_blocking.cc39
-rw-r--r--test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc39
-rw-r--r--test/syscalls/linux/socket_ip_unbound.cc474
-rw-r--r--test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc66
-rw-r--r--test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h30
-rw-r--r--test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc39
-rw-r--r--test/syscalls/linux/socket_ipv4_udp_unbound.cc2456
-rw-r--r--test/syscalls/linux/socket_ipv4_udp_unbound.h29
-rw-r--r--test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc1099
-rw-r--r--test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h46
-rw-r--r--test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc39
-rw-r--r--test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc32
-rw-r--r--test/syscalls/linux/socket_netdevice.cc184
-rw-r--r--test/syscalls/linux/socket_netlink.cc153
-rw-r--r--test/syscalls/linux/socket_netlink_route.cc935
-rw-r--r--test/syscalls/linux/socket_netlink_route_util.cc162
-rw-r--r--test/syscalls/linux/socket_netlink_route_util.h55
-rw-r--r--test/syscalls/linux/socket_netlink_uevent.cc83
-rw-r--r--test/syscalls/linux/socket_netlink_util.cc187
-rw-r--r--test/syscalls/linux/socket_netlink_util.h62
-rw-r--r--test/syscalls/linux/socket_non_blocking.cc62
-rw-r--r--test/syscalls/linux/socket_non_blocking.h29
-rw-r--r--test/syscalls/linux/socket_non_stream.cc337
-rw-r--r--test/syscalls/linux/socket_non_stream.h29
-rw-r--r--test/syscalls/linux/socket_non_stream_blocking.cc85
-rw-r--r--test/syscalls/linux/socket_non_stream_blocking.h30
-rw-r--r--test/syscalls/linux/socket_stream.cc178
-rw-r--r--test/syscalls/linux/socket_stream.h30
-rw-r--r--test/syscalls/linux/socket_stream_blocking.cc163
-rw-r--r--test/syscalls/linux/socket_stream_blocking.h30
-rw-r--r--test/syscalls/linux/socket_stream_nonblock.cc49
-rw-r--r--test/syscalls/linux/socket_stream_nonblock.h30
-rw-r--r--test/syscalls/linux/socket_test_util.cc907
-rw-r--r--test/syscalls/linux/socket_test_util.h518
-rw-r--r--test/syscalls/linux/socket_test_util_impl.cc28
-rw-r--r--test/syscalls/linux/socket_unix.cc274
-rw-r--r--test/syscalls/linux/socket_unix.h29
-rw-r--r--test/syscalls/linux/socket_unix_abstract_nonblock.cc39
-rw-r--r--test/syscalls/linux/socket_unix_blocking_local.cc45
-rw-r--r--test/syscalls/linux/socket_unix_cmsg.cc1501
-rw-r--r--test/syscalls/linux/socket_unix_cmsg.h30
-rw-r--r--test/syscalls/linux/socket_unix_dgram.cc45
-rw-r--r--test/syscalls/linux/socket_unix_dgram.h29
-rw-r--r--test/syscalls/linux/socket_unix_dgram_local.cc58
-rw-r--r--test/syscalls/linux/socket_unix_dgram_non_blocking.cc57
-rw-r--r--test/syscalls/linux/socket_unix_domain.cc39
-rw-r--r--test/syscalls/linux/socket_unix_filesystem_nonblock.cc39
-rw-r--r--test/syscalls/linux/socket_unix_non_stream.cc256
-rw-r--r--test/syscalls/linux/socket_unix_non_stream.h30
-rw-r--r--test/syscalls/linux/socket_unix_non_stream_blocking_local.cc42
-rw-r--r--test/syscalls/linux/socket_unix_pair.cc44
-rw-r--r--test/syscalls/linux/socket_unix_pair_nonblock.cc39
-rw-r--r--test/syscalls/linux/socket_unix_seqpacket.cc67
-rw-r--r--test/syscalls/linux/socket_unix_seqpacket.h30
-rw-r--r--test/syscalls/linux/socket_unix_seqpacket_local.cc58
-rw-r--r--test/syscalls/linux/socket_unix_stream.cc125
-rw-r--r--test/syscalls/linux/socket_unix_stream_blocking_local.cc40
-rw-r--r--test/syscalls/linux/socket_unix_stream_local.cc48
-rw-r--r--test/syscalls/linux/socket_unix_stream_nonblock_local.cc39
-rw-r--r--test/syscalls/linux/socket_unix_unbound_abstract.cc116
-rw-r--r--test/syscalls/linux/socket_unix_unbound_dgram.cc183
-rw-r--r--test/syscalls/linux/socket_unix_unbound_filesystem.cc84
-rw-r--r--test/syscalls/linux/socket_unix_unbound_seqpacket.cc89
-rw-r--r--test/syscalls/linux/socket_unix_unbound_stream.cc733
-rw-r--r--test/syscalls/linux/splice.cc699
-rw-r--r--test/syscalls/linux/stat.cc720
-rw-r--r--test/syscalls/linux/stat_times.cc303
-rw-r--r--test/syscalls/linux/statfs.cc82
-rw-r--r--test/syscalls/linux/sticky.cc161
-rw-r--r--test/syscalls/linux/symlink.cc402
-rw-r--r--test/syscalls/linux/sync.cc59
-rw-r--r--test/syscalls/linux/sync_file_range.cc112
-rw-r--r--test/syscalls/linux/sysinfo.cc86
-rw-r--r--test/syscalls/linux/syslog.cc51
-rw-r--r--test/syscalls/linux/sysret.cc142
-rw-r--r--test/syscalls/linux/tcp_socket.cc1568
-rw-r--r--test/syscalls/linux/tgkill.cc48
-rw-r--r--test/syscalls/linux/time.cc107
-rw-r--r--test/syscalls/linux/timerfd.cc273
-rw-r--r--test/syscalls/linux/timers.cc662
-rw-r--r--test/syscalls/linux/tkill.cc75
-rw-r--r--test/syscalls/linux/truncate.cc218
-rw-r--r--test/syscalls/linux/tuntap.cc422
-rw-r--r--test/syscalls/linux/tuntap_hostinet.cc38
-rw-r--r--test/syscalls/linux/udp_bind.cc316
-rw-r--r--test/syscalls/linux/udp_socket.cc30
-rw-r--r--test/syscalls/linux/udp_socket_errqueue_test_case.cc57
-rw-r--r--test/syscalls/linux/udp_socket_test_cases.cc1727
-rw-r--r--test/syscalls/linux/udp_socket_test_cases.h82
-rw-r--r--test/syscalls/linux/uidgid.cc276
-rw-r--r--test/syscalls/linux/uname.cc111
-rw-r--r--test/syscalls/linux/unix_domain_socket_test_util.cc351
-rw-r--r--test/syscalls/linux/unix_domain_socket_test_util.h162
-rw-r--r--test/syscalls/linux/unlink.cc214
-rw-r--r--test/syscalls/linux/unshare.cc50
-rw-r--r--test/syscalls/linux/utimes.cc319
-rw-r--r--test/syscalls/linux/vdso.cc48
-rw-r--r--test/syscalls/linux/vdso_clock_gettime.cc108
-rw-r--r--test/syscalls/linux/vfork.cc195
-rw-r--r--test/syscalls/linux/vsyscall.cc46
-rw-r--r--test/syscalls/linux/wait.cc913
-rw-r--r--test/syscalls/linux/write.cc139
-rw-r--r--test/syscalls/linux/xattr.cc610
279 files changed, 80398 insertions, 0 deletions
diff --git a/test/syscalls/linux/32bit.cc b/test/syscalls/linux/32bit.cc
new file mode 100644
index 000000000..3c825477c
--- /dev/null
+++ b/test/syscalls/linux/32bit.cc
@@ -0,0 +1,248 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include <sys/mman.h>
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "test/util/memory_util.h"
+#include "test/util/platform_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+#ifndef __x86_64__
+#error "This test is x86-64 specific."
+#endif
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr char kInt3 = '\xcc';
+constexpr char kInt80[2] = {'\xcd', '\x80'};
+constexpr char kSyscall[2] = {'\x0f', '\x05'};
+constexpr char kSysenter[2] = {'\x0f', '\x34'};
+
+void ExitGroup32(const char instruction[2], int code) {
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0));
+
+ // Fill with INT 3 in case we execute too far.
+ memset(m.ptr(), kInt3, m.len());
+
+ // Copy in the actual instruction.
+ memcpy(m.ptr(), instruction, 2);
+
+ // We're playing *extremely* fast-and-loose with the various syscall ABIs
+ // here, which we can more-or-less get away with since exit_group doesn't
+ // return.
+ //
+ // SYSENTER expects the user stack in (%ebp) and arg6 in 0(%ebp). The kernel
+ // will unconditionally dereference %ebp for arg6, so we must pass a valid
+ // address or it will return EFAULT.
+ //
+ // SYSENTER also unconditionally returns to thread_info->sysenter_return which
+ // is ostensibly a stub in the 32-bit VDSO. But a 64-bit binary doesn't have
+ // the 32-bit VDSO mapped, so sysenter_return will simply be the value
+ // inherited from the most recent 32-bit ancestor, or NULL if there is none.
+ // As a result, return would not return from SYSENTER.
+ asm volatile(
+ "movl $252, %%eax\n" // exit_group
+ "movl %[code], %%ebx\n" // code
+ "movl %%edx, %%ebp\n" // SYSENTER: user stack (use IP as a valid addr)
+ "leaq -20(%%rsp), %%rsp\n"
+ "movl $0x2b, 16(%%rsp)\n" // SS = CPL3 data segment
+ "movl $0,12(%%rsp)\n" // ESP = nullptr (unused)
+ "movl $0, 8(%%rsp)\n" // EFLAGS
+ "movl $0x23, 4(%%rsp)\n" // CS = CPL3 32-bit code segment
+ "movl %%edx, 0(%%rsp)\n" // EIP
+ "iretl\n"
+ "int $3\n"
+ :
+ : [ code ] "m"(code), [ ip ] "d"(m.ptr())
+ : "rax", "rbx");
+}
+
+constexpr int kExitCode = 42;
+
+TEST(Syscall32Bit, Int80) {
+ switch (PlatformSupport32Bit()) {
+ case PlatformSupport::NotSupported:
+ break;
+ case PlatformSupport::Segfault:
+ EXPECT_EXIT(ExitGroup32(kInt80, kExitCode),
+ ::testing::KilledBySignal(SIGSEGV), "");
+ break;
+
+ case PlatformSupport::Ignored:
+ // Since the call is ignored, we'll hit the int3 trap.
+ EXPECT_EXIT(ExitGroup32(kInt80, kExitCode),
+ ::testing::KilledBySignal(SIGTRAP), "");
+ break;
+
+ case PlatformSupport::Allowed:
+ EXPECT_EXIT(ExitGroup32(kInt80, kExitCode), ::testing::ExitedWithCode(42),
+ "");
+ break;
+ }
+}
+
+TEST(Syscall32Bit, Sysenter) {
+ if ((PlatformSupport32Bit() == PlatformSupport::Allowed ||
+ PlatformSupport32Bit() == PlatformSupport::Ignored) &&
+ GetCPUVendor() == CPUVendor::kAMD) {
+ // SYSENTER is an illegal instruction in compatibility mode on AMD.
+ EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
+ ::testing::KilledBySignal(SIGILL), "");
+ return;
+ }
+
+ switch (PlatformSupport32Bit()) {
+ case PlatformSupport::NotSupported:
+ break;
+
+ case PlatformSupport::Segfault:
+ EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
+ ::testing::KilledBySignal(SIGSEGV), "");
+ break;
+
+ case PlatformSupport::Ignored:
+ // See above, except expected code is SIGSEGV.
+ EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
+ ::testing::KilledBySignal(SIGSEGV), "");
+ break;
+
+ case PlatformSupport::Allowed:
+ EXPECT_EXIT(ExitGroup32(kSysenter, kExitCode),
+ ::testing::ExitedWithCode(42), "");
+ break;
+ }
+}
+
+TEST(Syscall32Bit, Syscall) {
+ if ((PlatformSupport32Bit() == PlatformSupport::Allowed ||
+ PlatformSupport32Bit() == PlatformSupport::Ignored) &&
+ GetCPUVendor() == CPUVendor::kIntel) {
+ // SYSCALL is an illegal instruction in compatibility mode on Intel.
+ EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
+ ::testing::KilledBySignal(SIGILL), "");
+ return;
+ }
+
+ switch (PlatformSupport32Bit()) {
+ case PlatformSupport::NotSupported:
+ break;
+
+ case PlatformSupport::Segfault:
+ EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
+ ::testing::KilledBySignal(SIGSEGV), "");
+ break;
+
+ case PlatformSupport::Ignored:
+ // See above.
+ EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
+ ::testing::KilledBySignal(SIGSEGV), "");
+ break;
+
+ case PlatformSupport::Allowed:
+ EXPECT_EXIT(ExitGroup32(kSyscall, kExitCode),
+ ::testing::ExitedWithCode(42), "");
+ break;
+ }
+}
+
+// Far call code called below.
+//
+// Input stack layout:
+//
+// %esp+12 lcall segment
+// %esp+8 lcall address offset
+// %esp+0 return address
+//
+// The lcall will enter compatibility mode and jump to the call address (the
+// address of the lret). The lret will return to 64-bit mode at the retq, which
+// will return to the external caller of this function.
+//
+// Since this enters compatibility mode, it must be mapped in a 32-bit region of
+// address space and have a 32-bit stack pointer.
+constexpr char kFarCall[] = {
+ '\x67', '\xff', '\x5c', '\x24', '\x08', // lcall *8(%esp)
+ '\xc3', // retq
+ '\xcb', // lret
+};
+
+void FarCall32() {
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0));
+
+ // Fill with INT 3 in case we execute too far.
+ memset(m.ptr(), kInt3, m.len());
+
+ // 32-bit code.
+ memcpy(m.ptr(), kFarCall, sizeof(kFarCall));
+
+ // Use the end of the code page as its stack.
+ uintptr_t stack = m.endaddr();
+
+ uintptr_t lcall = m.addr();
+ uintptr_t lret = m.addr() + sizeof(kFarCall) - 1;
+
+ // N.B. We must save and restore RSP manually. GCC can do so automatically
+ // with an "rsp" clobber, but clang cannot.
+ asm volatile(
+ // Place the address of lret (%edx) and the 32-bit code segment (0x23) on
+ // the 32-bit stack for lcall.
+ "subl $0x8, %%ecx\n"
+ "movl $0x23, 4(%%ecx)\n"
+ "movl %%edx, 0(%%ecx)\n"
+
+ // Save the current stack and switch to 32-bit stack.
+ "pushq %%rbp\n"
+ "movq %%rsp, %%rbp\n"
+ "movq %%rcx, %%rsp\n"
+
+ // Run the lcall code.
+ "callq *%%rbx\n"
+
+ // Restore the old stack.
+ "leaveq\n"
+ : "+c"(stack)
+ : "b"(lcall), "d"(lret));
+}
+
+TEST(Call32Bit, Disallowed) {
+ switch (PlatformSupport32Bit()) {
+ case PlatformSupport::NotSupported:
+ break;
+
+ case PlatformSupport::Segfault:
+ EXPECT_EXIT(FarCall32(), ::testing::KilledBySignal(SIGSEGV), "");
+ break;
+
+ case PlatformSupport::Ignored:
+ ABSL_FALLTHROUGH_INTENDED;
+ case PlatformSupport::Allowed:
+ // Shouldn't crash.
+ FarCall32();
+ }
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
new file mode 100644
index 000000000..9e097c888
--- /dev/null
+++ b/test/syscalls/linux/BUILD
@@ -0,0 +1,3933 @@
+load("//tools:defs.bzl", "cc_binary", "cc_library", "default_net_util", "gtest", "select_arch", "select_system")
+
+package(
+ default_visibility = ["//:sandbox"],
+ licenses = ["notice"],
+)
+
+exports_files(
+ [
+ "socket.cc",
+ "socket_inet_loopback.cc",
+ "socket_ip_loopback_blocking.cc",
+ "socket_ip_tcp_generic_loopback.cc",
+ "socket_ip_tcp_loopback.cc",
+ "socket_ip_tcp_loopback_blocking.cc",
+ "socket_ip_tcp_loopback_nonblock.cc",
+ "socket_ip_tcp_udp_generic.cc",
+ "socket_ip_udp_loopback.cc",
+ "socket_ip_udp_loopback_blocking.cc",
+ "socket_ip_udp_loopback_nonblock.cc",
+ "socket_ip_unbound.cc",
+ "socket_ipv4_tcp_unbound_external_networking_test.cc",
+ "socket_ipv4_udp_unbound_external_networking_test.cc",
+ "socket_ipv4_udp_unbound_loopback.cc",
+ "tcp_socket.cc",
+ "udp_bind.cc",
+ "udp_socket.cc",
+ ],
+ visibility = ["//:sandbox"],
+)
+
+cc_binary(
+ name = "sigaltstack_check",
+ testonly = 1,
+ srcs = ["sigaltstack_check.cc"],
+ deps = ["//test/util:logging"],
+)
+
+cc_binary(
+ name = "exec_assert_closed_workload",
+ testonly = 1,
+ srcs = ["exec_assert_closed_workload.cc"],
+ deps = [
+ "@com_google_absl//absl/strings",
+ ],
+)
+
+cc_binary(
+ name = "exec_basic_workload",
+ testonly = 1,
+ srcs = [
+ "exec.h",
+ "exec_basic_workload.cc",
+ ],
+)
+
+cc_binary(
+ name = "exec_proc_exe_workload",
+ testonly = 1,
+ srcs = ["exec_proc_exe_workload.cc"],
+ deps = [
+ "//test/util:fs_util",
+ "//test/util:posix_error",
+ ],
+)
+
+cc_binary(
+ name = "exec_state_workload",
+ testonly = 1,
+ srcs = ["exec_state_workload.cc"],
+ deps = ["@com_google_absl//absl/strings"],
+)
+
+sh_binary(
+ name = "exit_script",
+ testonly = 1,
+ srcs = [
+ "exit_script.sh",
+ ],
+)
+
+cc_binary(
+ name = "priority_execve",
+ testonly = 1,
+ srcs = [
+ "priority_execve.cc",
+ ],
+)
+
+cc_library(
+ name = "base_poll_test",
+ testonly = 1,
+ srcs = ["base_poll_test.cc"],
+ hdrs = ["base_poll_test.h"],
+ deps = [
+ "@com_google_absl//absl/memory",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:signal_util",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_library(
+ name = "file_base",
+ testonly = 1,
+ hdrs = ["file_base.h"],
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_util",
+ ],
+)
+
+cc_library(
+ name = "socket_netlink_util",
+ testonly = 1,
+ srcs = ["socket_netlink_util.cc"],
+ hdrs = ["socket_netlink_util.h"],
+ deps = [
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ "//test/util:posix_error",
+ "@com_google_absl//absl/strings",
+ ],
+)
+
+cc_library(
+ name = "socket_netlink_route_util",
+ testonly = 1,
+ srcs = ["socket_netlink_route_util.cc"],
+ hdrs = ["socket_netlink_route_util.h"],
+ deps = [
+ ":socket_netlink_util",
+ ],
+)
+
+cc_library(
+ name = "socket_test_util",
+ testonly = 1,
+ srcs = [
+ "socket_test_util.cc",
+ "socket_test_util_impl.cc",
+ ],
+ hdrs = ["socket_test_util.h"],
+ defines = select_system(),
+ deps = default_net_util() + [
+ gtest,
+ "@com_google_absl//absl/memory",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/strings:str_format",
+ "@com_google_absl//absl/time",
+ "@com_google_absl//absl/types:optional",
+ "//test/util:file_descriptor",
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_library(
+ name = "unix_domain_socket_test_util",
+ testonly = 1,
+ srcs = ["unix_domain_socket_test_util.cc"],
+ hdrs = ["unix_domain_socket_test_util.h"],
+ deps = [
+ ":socket_test_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:test_util",
+ ],
+)
+
+cc_library(
+ name = "ip_socket_test_util",
+ testonly = 1,
+ srcs = ["ip_socket_test_util.cc"],
+ hdrs = ["ip_socket_test_util.h"],
+ deps = [
+ ":socket_test_util",
+ "@com_google_absl//absl/strings",
+ ],
+)
+
+cc_binary(
+ name = "clock_nanosleep_test",
+ testonly = 1,
+ srcs = ["clock_nanosleep.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:timer_util",
+ ],
+)
+
+cc_binary(
+ name = "32bit_test",
+ testonly = 1,
+ srcs = select_arch(
+ amd64 = ["32bit.cc"],
+ arm64 = [],
+ ),
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/base:core_headers",
+ gtest,
+ "//test/util:memory_util",
+ "//test/util:platform_util",
+ "//test/util:posix_error",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "accept_bind_test",
+ testonly = 1,
+ srcs = ["accept_bind.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "accept_bind_stream_test",
+ testonly = 1,
+ srcs = ["accept_bind_stream.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "access_test",
+ testonly = 1,
+ srcs = ["access.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:fs_util",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "affinity_test",
+ testonly = 1,
+ srcs = ["affinity.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "aio_test",
+ testonly = 1,
+ srcs = [
+ "aio.cc",
+ "file_base.h",
+ ],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:memory_util",
+ "//test/util:posix_error",
+ "//test/util:proc_util",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "alarm_test",
+ testonly = 1,
+ srcs = ["alarm.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:signal_util",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "bad_test",
+ testonly = 1,
+ srcs = ["bad.cc"],
+ linkstatic = 1,
+ visibility = [
+ "//:sandbox",
+ ],
+ deps = [
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "bind_test",
+ testonly = 1,
+ srcs = ["bind.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_test",
+ testonly = 1,
+ srcs = ["socket.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ gtest,
+ "//test/util:file_descriptor",
+ "//test/util:temp_umask",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_capability_test",
+ testonly = 1,
+ srcs = ["socket_capability.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "brk_test",
+ testonly = 1,
+ srcs = ["brk.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "chdir_test",
+ testonly = 1,
+ srcs = ["chdir.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "chmod_test",
+ testonly = 1,
+ srcs = ["chmod.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "chown_test",
+ testonly = 1,
+ srcs = ["chown.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/synchronization",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "sticky_test",
+ testonly = 1,
+ srcs = ["sticky.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/flags:flag",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "chroot_test",
+ testonly = 1,
+ srcs = ["chroot.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:mount_util",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "clock_getres_test",
+ testonly = 1,
+ srcs = ["clock_getres.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "clock_gettime_test",
+ testonly = 1,
+ srcs = ["clock_gettime.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "concurrency_test",
+ testonly = 1,
+ srcs = ["concurrency.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:platform_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "connect_external_test",
+ testonly = 1,
+ srcs = ["connect_external.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "creat_test",
+ testonly = 1,
+ srcs = ["creat.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:fs_util",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "dev_test",
+ testonly = 1,
+ srcs = ["dev.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "dup_test",
+ testonly = 1,
+ srcs = ["dup.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:eventfd_util",
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "epoll_test",
+ testonly = 1,
+ srcs = ["epoll.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:epoll_util",
+ "//test/util:eventfd_util",
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "eventfd_test",
+ testonly = 1,
+ srcs = ["eventfd.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:epoll_util",
+ "//test/util:eventfd_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "exceptions_test",
+ testonly = 1,
+ srcs = select_arch(
+ amd64 = ["exceptions.cc"],
+ arm64 = [],
+ ),
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:logging",
+ "//test/util:platform_util",
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "getcpu_test",
+ testonly = 1,
+ srcs = ["getcpu.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "getcpu_host_test",
+ testonly = 1,
+ srcs = ["getcpu.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "getrusage_test",
+ testonly = 1,
+ srcs = ["getrusage.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:memory_util",
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "exec_binary_test",
+ testonly = 1,
+ srcs = ["exec_binary.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:proc_util",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "exec_test",
+ testonly = 1,
+ srcs = [
+ "exec.cc",
+ "exec.h",
+ ],
+ data = [
+ ":exec_assert_closed_workload",
+ ":exec_basic_workload",
+ ":exec_proc_exe_workload",
+ ":exec_state_workload",
+ ":exit_script",
+ ":priority_execve",
+ ],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/types:optional",
+ gtest,
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "exit_test",
+ testonly = 1,
+ srcs = ["exit.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:time_util",
+ ],
+)
+
+cc_binary(
+ name = "fallocate_test",
+ testonly = 1,
+ srcs = ["fallocate.cc"],
+ linkstatic = 1,
+ deps = [
+ ":file_base",
+ ":socket_test_util",
+ "//test/util:cleanup",
+ "//test/util:eventfd_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "fault_test",
+ testonly = 1,
+ srcs = ["fault.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "fchdir_test",
+ testonly = 1,
+ srcs = ["fchdir.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "fcntl_test",
+ testonly = 1,
+ srcs = ["fcntl.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ "//test/util:cleanup",
+ "//test/util:epoll_util",
+ "//test/util:eventfd_util",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/memory",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:save_util",
+ "//test/util:temp_path",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:timer_util",
+ ],
+)
+
+cc_binary(
+ name = "flock_test",
+ testonly = 1,
+ srcs = [
+ "file_base.h",
+ "flock.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:epoll_util",
+ "//test/util:eventfd_util",
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:timer_util",
+ ],
+)
+
+cc_binary(
+ name = "fork_test",
+ testonly = 1,
+ srcs = ["fork.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:memory_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "fpsig_fork_test",
+ testonly = 1,
+ srcs = ["fpsig_fork.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:logging",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "fpsig_nested_test",
+ testonly = 1,
+ srcs = ["fpsig_nested.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "sync_file_range_test",
+ testonly = 1,
+ srcs = ["sync_file_range.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "fsync_test",
+ testonly = 1,
+ srcs = ["fsync.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "futex_test",
+ testonly = 1,
+ srcs = ["futex.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/memory",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:memory_util",
+ "//test/util:save_util",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:time_util",
+ "//test/util:timer_util",
+ ],
+)
+
+cc_binary(
+ name = "getdents_test",
+ testonly = 1,
+ srcs = ["getdents.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:eventfd_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "getrandom_test",
+ testonly = 1,
+ srcs = ["getrandom.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "inotify_test",
+ testonly = 1,
+ srcs = ["inotify.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:epoll_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/strings:str_format",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/time",
+ ],
+)
+
+cc_binary(
+ name = "ioctl_test",
+ testonly = 1,
+ srcs = ["ioctl.cc"],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_library(
+ name = "iptables_types",
+ testonly = 1,
+ hdrs = [
+ "iptables.h",
+ ],
+)
+
+cc_binary(
+ name = "iptables_test",
+ testonly = 1,
+ srcs = [
+ "iptables.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":iptables_types",
+ ":socket_test_util",
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "itimer_test",
+ testonly = 1,
+ srcs = ["itimer.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:signal_util",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:timer_util",
+ ],
+)
+
+cc_binary(
+ name = "kill_test",
+ testonly = 1,
+ srcs = ["kill.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "link_test",
+ testonly = 1,
+ srcs = ["link.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "lseek_test",
+ testonly = 1,
+ srcs = ["lseek.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "madvise_test",
+ testonly = 1,
+ srcs = ["madvise.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:logging",
+ "//test/util:memory_util",
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "mempolicy_test",
+ testonly = 1,
+ srcs = ["mempolicy.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ "@com_google_absl//absl/memory",
+ gtest,
+ "//test/util:memory_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "mincore_test",
+ testonly = 1,
+ srcs = ["mincore.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:memory_util",
+ "//test/util:posix_error",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "mkdir_test",
+ testonly = 1,
+ srcs = ["mkdir.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:fs_util",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:temp_umask",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "mknod_test",
+ testonly = 1,
+ srcs = ["mknod.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "mlock_test",
+ testonly = 1,
+ srcs = ["mlock.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:cleanup",
+ gtest,
+ "//test/util:memory_util",
+ "//test/util:multiprocess_util",
+ "//test/util:rlimit_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "mmap_test",
+ testonly = 1,
+ srcs = ["mmap.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:memory_util",
+ "//test/util:multiprocess_util",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "mount_test",
+ testonly = 1,
+ srcs = ["mount.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:mount_util",
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "mremap_test",
+ testonly = 1,
+ srcs = ["mremap.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:logging",
+ "//test/util:memory_util",
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "msync_test",
+ testonly = 1,
+ srcs = ["msync.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "//test/util:memory_util",
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "munmap_test",
+ testonly = 1,
+ srcs = ["munmap.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "open_test",
+ testonly = 1,
+ srcs = [
+ "file_base.h",
+ "open.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/memory",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "open_create_test",
+ testonly = 1,
+ srcs = ["open_create.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:temp_umask",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "packet_socket_raw_test",
+ testonly = 1,
+ srcs = ["packet_socket_raw.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/base:endian",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "packet_socket_test",
+ testonly = 1,
+ srcs = ["packet_socket.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/base:endian",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "pty_test",
+ testonly = 1,
+ srcs = ["pty.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:pty_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "pty_root_test",
+ testonly = 1,
+ srcs = ["pty_root.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/base:core_headers",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:pty_util",
+ "//test/util:test_main",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "partial_bad_buffer_test",
+ testonly = 1,
+ srcs = ["partial_bad_buffer.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "pause_test",
+ testonly = 1,
+ srcs = ["pause.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "ping_socket_test",
+ testonly = 1,
+ srcs = ["ping_socket.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:save_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "pipe_test",
+ testonly = 1,
+ srcs = ["pipe.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "poll_test",
+ testonly = 1,
+ srcs = ["poll.cc"],
+ linkstatic = 1,
+ deps = [
+ ":base_poll_test",
+ "//test/util:eventfd_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "ppoll_test",
+ testonly = 1,
+ srcs = ["ppoll.cc"],
+ linkstatic = 1,
+ deps = [
+ ":base_poll_test",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "arch_prctl_test",
+ testonly = 1,
+ srcs = select_arch(
+ amd64 = ["arch_prctl.cc"],
+ arm64 = [],
+ ),
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "prctl_test",
+ testonly = 1,
+ srcs = ["prctl.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:cleanup",
+ "@com_google_absl//absl/flags:flag",
+ gtest,
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "prctl_setuid_test",
+ testonly = 1,
+ srcs = ["prctl_setuid.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "@com_google_absl//absl/flags:flag",
+ gtest,
+ "//test/util:logging",
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "pread64_test",
+ testonly = 1,
+ srcs = ["pread64.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "preadv_test",
+ testonly = 1,
+ srcs = ["preadv.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:memory_util",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:timer_util",
+ ],
+)
+
+cc_binary(
+ name = "preadv2_test",
+ testonly = 1,
+ srcs = [
+ "file_base.h",
+ "preadv2.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/memory",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "priority_test",
+ testonly = 1,
+ srcs = ["priority.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "proc_test",
+ testonly = 1,
+ srcs = ["proc.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:memory_util",
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:time_util",
+ "//test/util:timer_util",
+ ],
+)
+
+cc_binary(
+ name = "proc_net_test",
+ testonly = 1,
+ srcs = ["proc_net.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "proc_pid_oomscore_test",
+ testonly = 1,
+ srcs = ["proc_pid_oomscore.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:fs_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "@com_google_absl//absl/strings",
+ ],
+)
+
+cc_binary(
+ name = "proc_pid_smaps_test",
+ testonly = 1,
+ srcs = ["proc_pid_smaps.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/container:flat_hash_set",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/strings:str_format",
+ "@com_google_absl//absl/types:optional",
+ gtest,
+ "//test/util:memory_util",
+ "//test/util:posix_error",
+ "//test/util:proc_util",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "proc_pid_uid_gid_map_test",
+ testonly = 1,
+ srcs = ["proc_pid_uid_gid_map.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:logging",
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:save_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:time_util",
+ ],
+)
+
+cc_binary(
+ name = "pselect_test",
+ testonly = 1,
+ srcs = ["pselect.cc"],
+ linkstatic = 1,
+ deps = [
+ ":base_poll_test",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "ptrace_test",
+ testonly = 1,
+ srcs = ["ptrace.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:multiprocess_util",
+ "//test/util:platform_util",
+ "//test/util:signal_util",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:time_util",
+ ],
+)
+
+cc_binary(
+ name = "pwrite64_test",
+ testonly = 1,
+ srcs = ["pwrite64.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "pwritev2_test",
+ testonly = 1,
+ srcs = [
+ "pwritev2.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":file_base",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "raw_socket_hdrincl_test",
+ testonly = 1,
+ srcs = ["raw_socket_hdrincl.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/base:endian",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "raw_socket_test",
+ testonly = 1,
+ srcs = ["raw_socket.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/base:core_headers",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "raw_socket_icmp_test",
+ testonly = 1,
+ srcs = ["raw_socket_icmp.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/base:core_headers",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "read_test",
+ testonly = 1,
+ srcs = ["read.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "readahead_test",
+ testonly = 1,
+ srcs = ["readahead.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "readv_test",
+ testonly = 1,
+ srcs = [
+ "file_base.h",
+ "readv.cc",
+ "readv_common.cc",
+ "readv_common.h",
+ ],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:timer_util",
+ ],
+)
+
+cc_binary(
+ name = "readv_socket_test",
+ testonly = 1,
+ srcs = [
+ "readv_common.cc",
+ "readv_common.h",
+ "readv_socket.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "rename_test",
+ testonly = 1,
+ srcs = ["rename.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "rlimits_test",
+ testonly = 1,
+ srcs = ["rlimits.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "rseq_test",
+ testonly = 1,
+ srcs = ["rseq.cc"],
+ data = ["//test/syscalls/linux/rseq"],
+ linkstatic = 1,
+ deps = [
+ "//test/syscalls/linux/rseq:lib",
+ gtest,
+ "//test/util:logging",
+ "//test/util:multiprocess_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "rtsignal_test",
+ testonly = 1,
+ srcs = ["rtsignal.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ gtest,
+ "//test/util:logging",
+ "//test/util:posix_error",
+ "//test/util:signal_util",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "sched_test",
+ testonly = 1,
+ srcs = ["sched.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "sched_yield_test",
+ testonly = 1,
+ srcs = ["sched_yield.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "seccomp_test",
+ testonly = 1,
+ srcs = ["seccomp.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/base:core_headers",
+ gtest,
+ "//test/util:logging",
+ "//test/util:memory_util",
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:proc_util",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "select_test",
+ testonly = 1,
+ srcs = ["select.cc"],
+ linkstatic = 1,
+ deps = [
+ ":base_poll_test",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:rlimit_util",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "sendfile_test",
+ testonly = 1,
+ srcs = ["sendfile.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:eventfd_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "sendfile_socket_test",
+ testonly = 1,
+ srcs = ["sendfile_socket.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ gtest,
+ ":ip_socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "splice_test",
+ testonly = 1,
+ srcs = ["splice.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "sigaction_test",
+ testonly = 1,
+ srcs = ["sigaction.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "sigaltstack_test",
+ testonly = 1,
+ srcs = ["sigaltstack.cc"],
+ data = [
+ ":sigaltstack_check",
+ ],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ "//test/util:fs_util",
+ gtest,
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "sigiret_test",
+ testonly = 1,
+ srcs = select_arch(
+ amd64 = ["sigiret.cc"],
+ arm64 = [],
+ ),
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:logging",
+ "//test/util:signal_util",
+ "//test/util:test_util",
+ "//test/util:timer_util",
+ ] + select_arch(
+ amd64 = [],
+ arm64 = ["//test/util:test_main"],
+ ),
+)
+
+cc_binary(
+ name = "signalfd_test",
+ testonly = 1,
+ srcs = ["signalfd.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/synchronization",
+ gtest,
+ "//test/util:logging",
+ "//test/util:posix_error",
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "sigprocmask_test",
+ testonly = 1,
+ srcs = ["sigprocmask.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "sigstop_test",
+ testonly = 1,
+ srcs = ["sigstop.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "sigtimedwait_test",
+ testonly = 1,
+ srcs = ["sigtimedwait.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:signal_util",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:timer_util",
+ ],
+)
+
+cc_library(
+ name = "socket_generic_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_generic.cc",
+ ],
+ hdrs = [
+ "socket_generic.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/strings:str_format",
+ gtest,
+ "//test/util:test_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_binary(
+ name = "socket_stress_test",
+ testonly = 1,
+ srcs = [
+ "socket_generic_stress.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_library(
+ name = "socket_unix_dgram_test_cases",
+ testonly = 1,
+ srcs = ["socket_unix_dgram.cc"],
+ hdrs = ["socket_unix_dgram.h"],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_unix_seqpacket_test_cases",
+ testonly = 1,
+ srcs = ["socket_unix_seqpacket.cc"],
+ hdrs = ["socket_unix_seqpacket.h"],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_ip_tcp_generic_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_ip_tcp_generic.cc",
+ ],
+ hdrs = [
+ "socket_ip_tcp_generic.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ "@com_google_absl//absl/memory",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_non_blocking_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_non_blocking.cc",
+ ],
+ hdrs = [
+ "socket_non_blocking.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_unix_non_stream_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_unix_non_stream.cc",
+ ],
+ hdrs = [
+ "socket_unix_non_stream.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:memory_util",
+ "//test/util:test_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_non_stream_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_non_stream.cc",
+ ],
+ hdrs = [
+ "socket_non_stream.h",
+ ],
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_ip_udp_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_ip_udp_generic.cc",
+ ],
+ hdrs = [
+ "socket_ip_udp_generic.h",
+ ],
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_test_util",
+ gtest,
+ "//test/util:test_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_ipv4_udp_unbound_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_ipv4_udp_unbound.cc",
+ ],
+ hdrs = [
+ "socket_ipv4_udp_unbound.h",
+ ],
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_test_util",
+ "@com_google_absl//absl/memory",
+ gtest,
+ "//test/util:test_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_ipv4_udp_unbound_external_networking_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_ipv4_udp_unbound_external_networking.cc",
+ ],
+ hdrs = [
+ "socket_ipv4_udp_unbound_external_networking.h",
+ ],
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_test_util",
+ gtest,
+ "//test/util:test_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_ipv4_tcp_unbound_external_networking_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_ipv4_tcp_unbound_external_networking.cc",
+ ],
+ hdrs = [
+ "socket_ipv4_tcp_unbound_external_networking.h",
+ ],
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_test_util",
+ gtest,
+ "//test/util:test_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_binary(
+ name = "socket_abstract_test",
+ testonly = 1,
+ srcs = [
+ "socket_abstract.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_generic_test_cases",
+ ":socket_test_util",
+ ":socket_unix_cmsg_test_cases",
+ ":socket_unix_test_cases",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_abstract_non_blocking_test",
+ testonly = 1,
+ srcs = [
+ "socket_unix_abstract_nonblock.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_non_blocking_test_cases",
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_unix_dgram_local_test",
+ testonly = 1,
+ srcs = ["socket_unix_dgram_local.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_non_stream_test_cases",
+ ":socket_test_util",
+ ":socket_unix_dgram_test_cases",
+ ":socket_unix_non_stream_test_cases",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_unix_dgram_non_blocking_test",
+ testonly = 1,
+ srcs = ["socket_unix_dgram_non_blocking.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_unix_seqpacket_local_test",
+ testonly = 1,
+ srcs = [
+ "socket_unix_seqpacket_local.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_non_stream_test_cases",
+ ":socket_test_util",
+ ":socket_unix_non_stream_test_cases",
+ ":socket_unix_seqpacket_test_cases",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_unix_stream_test",
+ testonly = 1,
+ srcs = ["socket_unix_stream.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_ip_tcp_generic_loopback_test",
+ testonly = 1,
+ srcs = [
+ "socket_ip_tcp_generic_loopback.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_ip_tcp_generic_test_cases",
+ ":socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_ip_tcp_udp_generic_loopback_test",
+ testonly = 1,
+ srcs = [
+ "socket_ip_tcp_udp_generic.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_ip_tcp_loopback_test",
+ testonly = 1,
+ srcs = [
+ "socket_ip_tcp_loopback.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_generic_test_cases",
+ ":socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_ip_tcp_loopback_non_blocking_test",
+ testonly = 1,
+ srcs = [
+ "socket_ip_tcp_loopback_nonblock.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_non_blocking_test_cases",
+ ":socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_ip_udp_loopback_test",
+ testonly = 1,
+ srcs = [
+ "socket_ip_udp_loopback.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_generic_test_cases",
+ ":socket_ip_udp_test_cases",
+ ":socket_non_stream_test_cases",
+ ":socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_ipv4_udp_unbound_external_networking_test",
+ testonly = 1,
+ srcs = [
+ "socket_ipv4_udp_unbound_external_networking_test.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_ipv4_udp_unbound_external_networking_test_cases",
+ ":socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_ipv4_tcp_unbound_external_networking_test",
+ testonly = 1,
+ srcs = [
+ "socket_ipv4_tcp_unbound_external_networking_test.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_ipv4_tcp_unbound_external_networking_test_cases",
+ ":socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_bind_to_device_test",
+ testonly = 1,
+ srcs = [
+ "socket_bind_to_device.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_bind_to_device_util",
+ ":socket_test_util",
+ "//test/util:capability_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_bind_to_device_sequence_test",
+ testonly = 1,
+ srcs = [
+ "socket_bind_to_device_sequence.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_bind_to_device_util",
+ ":socket_test_util",
+ "//test/util:capability_util",
+ "@com_google_absl//absl/container:node_hash_map",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_bind_to_device_distribution_test",
+ testonly = 1,
+ srcs = [
+ "socket_bind_to_device_distribution.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_bind_to_device_util",
+ ":socket_test_util",
+ "//test/util:capability_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_ip_udp_loopback_non_blocking_test",
+ testonly = 1,
+ srcs = [
+ "socket_ip_udp_loopback_nonblock.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_non_blocking_test_cases",
+ ":socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_ipv4_udp_unbound_loopback_test",
+ testonly = 1,
+ srcs = [
+ "socket_ipv4_udp_unbound_loopback.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_ipv4_udp_unbound_test_cases",
+ ":socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_ip_unbound_test",
+ testonly = 1,
+ srcs = [
+ "socket_ip_unbound.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_domain_test",
+ testonly = 1,
+ srcs = [
+ "socket_unix_domain.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_generic_test_cases",
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_domain_non_blocking_test",
+ testonly = 1,
+ srcs = [
+ "socket_unix_pair_nonblock.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_non_blocking_test_cases",
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_filesystem_test",
+ testonly = 1,
+ srcs = [
+ "socket_filesystem.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_generic_test_cases",
+ ":socket_test_util",
+ ":socket_unix_cmsg_test_cases",
+ ":socket_unix_test_cases",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_filesystem_non_blocking_test",
+ testonly = 1,
+ srcs = [
+ "socket_unix_filesystem_nonblock.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_non_blocking_test_cases",
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_inet_loopback_test",
+ testonly = 1,
+ srcs = ["socket_inet_loopback.cc"],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/memory",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:save_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_inet_loopback_nogotsan_test",
+ testonly = 1,
+ srcs = ["socket_inet_loopback_nogotsan.cc"],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/memory",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:save_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_netlink_test",
+ testonly = 1,
+ srcs = ["socket_netlink.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_netlink_route_test",
+ testonly = 1,
+ srcs = ["socket_netlink_route.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_netlink_route_util",
+ ":socket_netlink_util",
+ ":socket_test_util",
+ "//test/util:capability_util",
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings:str_format",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_netlink_uevent_test",
+ testonly = 1,
+ srcs = ["socket_netlink_uevent.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_netlink_util",
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+# These socket tests are in a library because the test cases are shared
+# across several test build targets.
+cc_library(
+ name = "socket_stream_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_stream.cc",
+ ],
+ hdrs = [
+ "socket_stream.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_blocking_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_blocking.cc",
+ ],
+ hdrs = [
+ "socket_blocking.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:timer_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_unix_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_unix.cc",
+ ],
+ hdrs = [
+ "socket_unix.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_unix_cmsg_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_unix_cmsg.cc",
+ ],
+ hdrs = [
+ "socket_unix_cmsg.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_stream_blocking_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_stream_blocking.cc",
+ ],
+ hdrs = [
+ "socket_stream_blocking.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:timer_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_stream_nonblocking_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_stream_nonblock.cc",
+ ],
+ hdrs = [
+ "socket_stream_nonblock.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_non_stream_blocking_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_non_stream_blocking.cc",
+ ],
+ hdrs = [
+ "socket_non_stream_blocking.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "socket_bind_to_device_util",
+ testonly = 1,
+ srcs = [
+ "socket_bind_to_device_util.cc",
+ ],
+ hdrs = [
+ "socket_bind_to_device_util.h",
+ ],
+ deps = [
+ "//test/util:test_util",
+ "@com_google_absl//absl/memory",
+ "@com_google_absl//absl/strings",
+ ],
+ alwayslink = 1,
+)
+
+cc_binary(
+ name = "socket_stream_local_test",
+ testonly = 1,
+ srcs = [
+ "socket_unix_stream_local.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_stream_test_cases",
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_stream_blocking_local_test",
+ testonly = 1,
+ srcs = [
+ "socket_unix_stream_blocking_local.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_stream_blocking_test_cases",
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_stream_blocking_tcp_test",
+ testonly = 1,
+ srcs = [
+ "socket_ip_tcp_loopback_blocking.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_stream_blocking_test_cases",
+ ":socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_stream_nonblock_local_test",
+ testonly = 1,
+ srcs = [
+ "socket_unix_stream_nonblock_local.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_stream_nonblocking_test_cases",
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_unix_unbound_dgram_test",
+ testonly = 1,
+ srcs = ["socket_unix_unbound_dgram.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_unix_unbound_abstract_test",
+ testonly = 1,
+ srcs = ["socket_unix_unbound_abstract.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_unix_unbound_filesystem_test",
+ testonly = 1,
+ srcs = ["socket_unix_unbound_filesystem.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_blocking_local_test",
+ testonly = 1,
+ srcs = [
+ "socket_unix_blocking_local.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_blocking_test_cases",
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_blocking_ip_test",
+ testonly = 1,
+ srcs = [
+ "socket_ip_loopback_blocking.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_blocking_test_cases",
+ ":socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_non_stream_blocking_local_test",
+ testonly = 1,
+ srcs = [
+ "socket_unix_non_stream_blocking_local.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_non_stream_blocking_test_cases",
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_non_stream_blocking_udp_test",
+ testonly = 1,
+ srcs = [
+ "socket_ip_udp_loopback_blocking.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_non_stream_blocking_test_cases",
+ ":socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_unix_pair_test",
+ testonly = 1,
+ srcs = [
+ "socket_unix_pair.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":socket_unix_cmsg_test_cases",
+ ":socket_unix_test_cases",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_unix_unbound_seqpacket_test",
+ testonly = 1,
+ srcs = ["socket_unix_unbound_seqpacket.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_unix_unbound_stream_test",
+ testonly = 1,
+ srcs = ["socket_unix_unbound_stream.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "socket_netdevice_test",
+ testonly = 1,
+ srcs = ["socket_netdevice.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_netlink_util",
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/base:endian",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "stat_test",
+ testonly = 1,
+ srcs = [
+ "file_base.h",
+ "stat.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "stat_times_test",
+ testonly = 1,
+ srcs = ["stat_times.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "statfs_test",
+ testonly = 1,
+ srcs = [
+ "file_base.h",
+ "statfs.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "symlink_test",
+ testonly = 1,
+ srcs = ["symlink.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "sync_test",
+ testonly = 1,
+ srcs = ["sync.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "sysinfo_test",
+ testonly = 1,
+ srcs = ["sysinfo.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "syslog_test",
+ testonly = 1,
+ srcs = ["syslog.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "sysret_test",
+ testonly = 1,
+ srcs = ["sysret.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:logging",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "tcp_socket_test",
+ testonly = 1,
+ srcs = ["tcp_socket.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "tgkill_test",
+ testonly = 1,
+ srcs = ["tgkill.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "time_test",
+ testonly = 1,
+ srcs = ["time.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:proc_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "timerfd_test",
+ testonly = 1,
+ srcs = ["timerfd.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "//test/util:posix_error",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "@com_google_absl//absl/time",
+ ],
+)
+
+cc_binary(
+ name = "timers_test",
+ testonly = 1,
+ srcs = ["timers.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:signal_util",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "tkill_test",
+ testonly = 1,
+ srcs = ["tkill.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:logging",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "truncate_test",
+ testonly = 1,
+ srcs = ["truncate.cc"],
+ linkstatic = 1,
+ deps = [
+ ":file_base",
+ "//test/util:capability_util",
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "tuntap_test",
+ testonly = 1,
+ srcs = ["tuntap.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ gtest,
+ ":socket_netlink_route_util",
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "//test/util:posix_error",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "@com_google_absl//absl/strings",
+ ],
+)
+
+cc_binary(
+ name = "tuntap_hostinet_test",
+ testonly = 1,
+ srcs = ["tuntap_hostinet.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_library(
+ name = "udp_socket_test_cases",
+ testonly = 1,
+ srcs = [
+ "udp_socket_errqueue_test_case.cc",
+ "udp_socket_test_cases.cc",
+ ],
+ hdrs = ["udp_socket_test_cases.h"],
+ defines = select_system(),
+ deps = [
+ ":ip_socket_test_util",
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/strings:str_format",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:file_descriptor",
+ "//test/util:posix_error",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+ alwayslink = 1,
+)
+
+cc_binary(
+ name = "udp_socket_test",
+ testonly = 1,
+ srcs = ["udp_socket.cc"],
+ linkstatic = 1,
+ deps = [
+ ":udp_socket_test_cases",
+ ],
+)
+
+cc_binary(
+ name = "udp_bind_test",
+ testonly = 1,
+ srcs = ["udp_bind.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "uidgid_test",
+ testonly = 1,
+ srcs = ["uidgid.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:uid_util",
+ ],
+)
+
+cc_binary(
+ name = "uname_test",
+ testonly = 1,
+ srcs = ["uname.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "unlink_test",
+ testonly = 1,
+ srcs = ["unlink.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "unshare_test",
+ testonly = 1,
+ srcs = ["unshare.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/synchronization",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "utimes_test",
+ testonly = 1,
+ srcs = ["utimes.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "@com_google_absl//absl/time",
+ ],
+)
+
+cc_binary(
+ name = "vdso_test",
+ testonly = 1,
+ srcs = ["vdso.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:fs_util",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:proc_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "vfork_test",
+ testonly = 1,
+ srcs = ["vfork.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/flags:flag",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:multiprocess_util",
+ "//test/util:test_util",
+ "//test/util:time_util",
+ ],
+)
+
+cc_binary(
+ name = "wait_test",
+ testonly = 1,
+ srcs = ["wait.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:logging",
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:signal_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "//test/util:time_util",
+ ],
+)
+
+cc_binary(
+ name = "write_test",
+ testonly = 1,
+ srcs = ["write.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:cleanup",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "memory_accounting_test",
+ testonly = 1,
+ srcs = ["memory_accounting.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/strings:str_format",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "network_namespace_test",
+ testonly = 1,
+ srcs = ["network_namespace.cc"],
+ linkstatic = 1,
+ deps = [
+ ":socket_test_util",
+ gtest,
+ "//test/util:capability_util",
+ "//test/util:posix_error",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "semaphore_test",
+ testonly = 1,
+ srcs = ["semaphore.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "@com_google_absl//absl/base:core_headers",
+ "@com_google_absl//absl/memory",
+ "@com_google_absl//absl/synchronization",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ ],
+)
+
+cc_binary(
+ name = "shm_test",
+ testonly = 1,
+ srcs = ["shm.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:multiprocess_util",
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ "@com_google_absl//absl/time",
+ ],
+)
+
+cc_binary(
+ name = "fadvise64_test",
+ testonly = 1,
+ srcs = ["fadvise64.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ gtest,
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "vdso_clock_gettime_test",
+ testonly = 1,
+ srcs = ["vdso_clock_gettime.cc"],
+ linkstatic = 1,
+ deps = [
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/time",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "vsyscall_test",
+ testonly = 1,
+ srcs = ["vsyscall.cc"],
+ linkstatic = 1,
+ deps = [
+ gtest,
+ "//test/util:proc_util",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "proc_net_unix_test",
+ testonly = 1,
+ srcs = ["proc_net_unix.cc"],
+ linkstatic = 1,
+ deps = [
+ ":unix_domain_socket_test_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/strings",
+ "@com_google_absl//absl/strings:str_format",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "memfd_test",
+ testonly = 1,
+ srcs = ["memfd.cc"],
+ linkstatic = 1,
+ deps = [
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ gtest,
+ "//test/util:memory_util",
+ "//test/util:multiprocess_util",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "proc_net_tcp_test",
+ testonly = 1,
+ srcs = ["proc_net_tcp.cc"],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "proc_net_udp_test",
+ testonly = 1,
+ srcs = ["proc_net_udp.cc"],
+ linkstatic = 1,
+ deps = [
+ ":ip_socket_test_util",
+ "//test/util:file_descriptor",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
+
+cc_binary(
+ name = "xattr_test",
+ testonly = 1,
+ srcs = [
+ "file_base.h",
+ "xattr.cc",
+ ],
+ linkstatic = 1,
+ deps = [
+ "//test/util:capability_util",
+ "//test/util:file_descriptor",
+ "//test/util:fs_util",
+ "@com_google_absl//absl/container:flat_hash_set",
+ "@com_google_absl//absl/strings",
+ gtest,
+ "//test/util:posix_error",
+ "//test/util:temp_path",
+ "//test/util:test_main",
+ "//test/util:test_util",
+ ],
+)
diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc
new file mode 100644
index 000000000..f65a14fb8
--- /dev/null
+++ b/test/syscalls/linux/accept_bind.cc
@@ -0,0 +1,641 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST_P(AllSocketPairTest, Listen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 5),
+ SyscallSucceeds());
+}
+
+TEST_P(AllSocketPairTest, ListenIncreaseBacklog) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 5),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 10),
+ SyscallSucceeds());
+}
+
+TEST_P(AllSocketPairTest, ListenDecreaseBacklog) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 5),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(sockets->first_fd(), /* backlog = */ 1),
+ SyscallSucceeds());
+}
+
+TEST_P(AllSocketPairTest, ListenWithoutBind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(listen(sockets->first_fd(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(AllSocketPairTest, DoubleBind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->second_addr(),
+ sockets->second_addr_size()),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(AllSocketPairTest, BindListenBind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->second_addr(),
+ sockets->second_addr_size()),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(AllSocketPairTest, DoubleListen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+}
+
+TEST_P(AllSocketPairTest, DoubleConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallFailsWithErrno(EISCONN));
+}
+
+TEST_P(AllSocketPairTest, Connect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+}
+
+TEST_P(AllSocketPairTest, ConnectWithWrongType) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int type;
+ socklen_t typelen = sizeof(type);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_TYPE, &type, &typelen),
+ SyscallSucceeds());
+ switch (type) {
+ case SOCK_STREAM:
+ type = SOCK_SEQPACKET;
+ break;
+ case SOCK_SEQPACKET:
+ type = SOCK_STREAM;
+ break;
+ }
+
+ const FileDescriptor another_socket =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, type, 0));
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ if (sockets->first_addr()->sa_data[0] != 0) {
+ ASSERT_THAT(connect(another_socket.get(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallFailsWithErrno(EPROTOTYPE));
+ } else {
+ ASSERT_THAT(connect(another_socket.get(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallFailsWithErrno(ECONNREFUSED));
+ }
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+}
+
+TEST_P(AllSocketPairTest, ConnectNonListening) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallFailsWithErrno(ECONNREFUSED));
+}
+
+TEST_P(AllSocketPairTest, ConnectToFilePath) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct sockaddr_un addr = {};
+ addr.sun_family = AF_UNIX;
+ constexpr char kPath[] = "/tmp";
+ memcpy(addr.sun_path, kPath, sizeof(kPath));
+
+ ASSERT_THAT(
+ connect(sockets->second_fd(),
+ reinterpret_cast<const struct sockaddr*>(&addr), sizeof(addr)),
+ SyscallFailsWithErrno(ECONNREFUSED));
+}
+
+TEST_P(AllSocketPairTest, ConnectToInvalidAbstractPath) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct sockaddr_un addr = {};
+ addr.sun_family = AF_UNIX;
+ constexpr char kPath[] = "\0nonexistent";
+ memcpy(addr.sun_path, kPath, sizeof(kPath));
+
+ ASSERT_THAT(
+ connect(sockets->second_fd(),
+ reinterpret_cast<const struct sockaddr*>(&addr), sizeof(addr)),
+ SyscallFailsWithErrno(ECONNREFUSED));
+}
+
+TEST_P(AllSocketPairTest, SelfConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(AllSocketPairTest, ConnectWithoutListen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallFailsWithErrno(ECONNREFUSED));
+}
+
+TEST_P(AllSocketPairTest, Accept) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ int accepted = -1;
+ ASSERT_THAT(accepted = accept(sockets->first_fd(), nullptr, nullptr),
+ SyscallSucceeds());
+ ASSERT_THAT(close(accepted), SyscallSucceeds());
+}
+
+TEST_P(AllSocketPairTest, AcceptValidAddrLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ int accepted = -1;
+ struct sockaddr_un addr = {};
+ socklen_t addr_len = sizeof(addr);
+ ASSERT_THAT(
+ accepted = accept(sockets->first_fd(),
+ reinterpret_cast<struct sockaddr*>(&addr), &addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(close(accepted), SyscallSucceeds());
+}
+
+TEST_P(AllSocketPairTest, AcceptNegativeAddrLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ // With a negative addr_len, accept returns EINVAL,
+ struct sockaddr_un addr = {};
+ socklen_t addr_len = -1;
+ ASSERT_THAT(accept(sockets->first_fd(),
+ reinterpret_cast<struct sockaddr*>(&addr), &addr_len),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(AllSocketPairTest, AcceptLargePositiveAddrLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ // With a large (positive) addr_len, accept does not return EINVAL.
+ int accepted = -1;
+ char addr_buf[200];
+ socklen_t addr_len = sizeof(addr_buf);
+ ASSERT_THAT(accepted = accept(sockets->first_fd(),
+ reinterpret_cast<struct sockaddr*>(addr_buf),
+ &addr_len),
+ SyscallSucceeds());
+ // addr_len should have been updated by accept().
+ EXPECT_LT(addr_len, sizeof(addr_buf));
+ ASSERT_THAT(close(accepted), SyscallSucceeds());
+}
+
+TEST_P(AllSocketPairTest, AcceptVeryLargePositiveAddrLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ // With a large (positive) addr_len, accept does not return EINVAL.
+ int accepted = -1;
+ char addr_buf[2000];
+ socklen_t addr_len = sizeof(addr_buf);
+ ASSERT_THAT(accepted = accept(sockets->first_fd(),
+ reinterpret_cast<struct sockaddr*>(addr_buf),
+ &addr_len),
+ SyscallSucceeds());
+ // addr_len should have been updated by accept().
+ EXPECT_LT(addr_len, sizeof(addr_buf));
+ ASSERT_THAT(close(accepted), SyscallSucceeds());
+}
+
+TEST_P(AllSocketPairTest, AcceptWithoutBind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(accept(sockets->first_fd(), nullptr, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(AllSocketPairTest, AcceptWithoutListen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ ASSERT_THAT(accept(sockets->first_fd(), nullptr, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(AllSocketPairTest, GetRemoteAddress) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ socklen_t addr_len = sockets->first_addr_size();
+ struct sockaddr_storage addr = {};
+ ASSERT_THAT(
+ getpeername(sockets->second_fd(), (struct sockaddr*)(&addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, sockets->first_addr_len());
+ EXPECT_EQ(0, memcmp(&addr, sockets->first_addr(), sockets->first_addr_len()));
+}
+
+TEST_P(AllSocketPairTest, UnboundGetLocalAddress) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ socklen_t addr_len = sockets->first_addr_size();
+ struct sockaddr_storage addr = {};
+ ASSERT_THAT(
+ getsockname(sockets->second_fd(), (struct sockaddr*)(&addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, 2);
+ EXPECT_EQ(
+ memcmp(&addr, sockets->second_addr(),
+ std::min((size_t)addr_len, (size_t)sockets->second_addr_len())),
+ 0);
+}
+
+TEST_P(AllSocketPairTest, BoundGetLocalAddress) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(bind(sockets->second_fd(), sockets->second_addr(),
+ sockets->second_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ socklen_t addr_len = sockets->first_addr_size();
+ struct sockaddr_storage addr = {};
+ ASSERT_THAT(
+ getsockname(sockets->second_fd(), (struct sockaddr*)(&addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, sockets->second_addr_len());
+ EXPECT_EQ(
+ memcmp(&addr, sockets->second_addr(),
+ std::min((size_t)addr_len, (size_t)sockets->second_addr_len())),
+ 0);
+}
+
+TEST_P(AllSocketPairTest, BoundConnector) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(bind(sockets->second_fd(), sockets->second_addr(),
+ sockets->second_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+}
+
+TEST_P(AllSocketPairTest, UnboundSenderAddr) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ int accepted = -1;
+ ASSERT_THAT(accepted = accept(sockets->first_fd(), nullptr, nullptr),
+ SyscallSucceeds());
+ FileDescriptor accepted_fd(accepted);
+
+ int i = 0;
+ ASSERT_THAT(RetryEINTR(send)(sockets->second_fd(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+
+ struct sockaddr_storage addr;
+ socklen_t addr_len = sizeof(addr);
+ ASSERT_THAT(
+ RetryEINTR(recvfrom)(accepted_fd.get(), &i, sizeof(i), 0,
+ reinterpret_cast<sockaddr*>(&addr), &addr_len),
+ SyscallSucceedsWithValue(sizeof(i)));
+ EXPECT_EQ(addr_len, 0);
+}
+
+TEST_P(AllSocketPairTest, BoundSenderAddr) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(bind(sockets->second_fd(), sockets->second_addr(),
+ sockets->second_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ int accepted = -1;
+ ASSERT_THAT(accepted = accept(sockets->first_fd(), nullptr, nullptr),
+ SyscallSucceeds());
+ FileDescriptor accepted_fd(accepted);
+
+ int i = 0;
+ ASSERT_THAT(RetryEINTR(send)(sockets->second_fd(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+
+ struct sockaddr_storage addr;
+ socklen_t addr_len = sizeof(addr);
+ ASSERT_THAT(
+ RetryEINTR(recvfrom)(accepted_fd.get(), &i, sizeof(i), 0,
+ reinterpret_cast<sockaddr*>(&addr), &addr_len),
+ SyscallSucceedsWithValue(sizeof(i)));
+ EXPECT_EQ(addr_len, sockets->second_addr_len());
+ EXPECT_EQ(
+ memcmp(&addr, sockets->second_addr(),
+ std::min((size_t)addr_len, (size_t)sockets->second_addr_len())),
+ 0);
+}
+
+TEST_P(AllSocketPairTest, BindAfterConnectSenderAddr) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(bind(sockets->second_fd(), sockets->second_addr(),
+ sockets->second_addr_size()),
+ SyscallSucceeds());
+
+ int accepted = -1;
+ ASSERT_THAT(accepted = accept(sockets->first_fd(), nullptr, nullptr),
+ SyscallSucceeds());
+ FileDescriptor accepted_fd(accepted);
+
+ int i = 0;
+ ASSERT_THAT(RetryEINTR(send)(sockets->second_fd(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+
+ struct sockaddr_storage addr;
+ socklen_t addr_len = sizeof(addr);
+ ASSERT_THAT(
+ RetryEINTR(recvfrom)(accepted_fd.get(), &i, sizeof(i), 0,
+ reinterpret_cast<sockaddr*>(&addr), &addr_len),
+ SyscallSucceedsWithValue(sizeof(i)));
+ EXPECT_EQ(addr_len, sockets->second_addr_len());
+ EXPECT_EQ(
+ memcmp(&addr, sockets->second_addr(),
+ std::min((size_t)addr_len, (size_t)sockets->second_addr_len())),
+ 0);
+}
+
+TEST_P(AllSocketPairTest, BindAfterAcceptSenderAddr) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ int accepted = -1;
+ ASSERT_THAT(accepted = accept(sockets->first_fd(), nullptr, nullptr),
+ SyscallSucceeds());
+ FileDescriptor accepted_fd(accepted);
+
+ ASSERT_THAT(bind(sockets->second_fd(), sockets->second_addr(),
+ sockets->second_addr_size()),
+ SyscallSucceeds());
+
+ int i = 0;
+ ASSERT_THAT(RetryEINTR(send)(sockets->second_fd(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+
+ struct sockaddr_storage addr;
+ socklen_t addr_len = sizeof(addr);
+ ASSERT_THAT(
+ RetryEINTR(recvfrom)(accepted_fd.get(), &i, sizeof(i), 0,
+ reinterpret_cast<sockaddr*>(&addr), &addr_len),
+ SyscallSucceedsWithValue(sizeof(i)));
+ EXPECT_EQ(addr_len, sockets->second_addr_len());
+ EXPECT_EQ(
+ memcmp(&addr, sockets->second_addr(),
+ std::min((size_t)addr_len, (size_t)sockets->second_addr_len())),
+ 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, AllSocketPairTest,
+ ::testing::ValuesIn(VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(
+ FilesystemUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ AbstractUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK})))));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/accept_bind_stream.cc b/test/syscalls/linux/accept_bind_stream.cc
new file mode 100644
index 000000000..4857f160b
--- /dev/null
+++ b/test/syscalls/linux/accept_bind_stream.cc
@@ -0,0 +1,92 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/un.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST_P(AllSocketPairTest, BoundSenderAddrCoalesced) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ int accepted = -1;
+ ASSERT_THAT(accepted = accept(sockets->first_fd(), nullptr, nullptr),
+ SyscallSucceeds());
+ FileDescriptor closer(accepted);
+
+ int i = 0;
+ ASSERT_THAT(RetryEINTR(send)(sockets->second_fd(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+
+ ASSERT_THAT(bind(sockets->second_fd(), sockets->second_addr(),
+ sockets->second_addr_size()),
+ SyscallSucceeds());
+
+ i = 0;
+ ASSERT_THAT(RetryEINTR(send)(sockets->second_fd(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+
+ int ri[2] = {0, 0};
+ struct sockaddr_storage addr;
+ socklen_t addr_len = sizeof(addr);
+ ASSERT_THAT(
+ RetryEINTR(recvfrom)(accepted, ri, sizeof(ri), 0,
+ reinterpret_cast<sockaddr*>(&addr), &addr_len),
+ SyscallSucceedsWithValue(sizeof(ri)));
+ EXPECT_EQ(addr_len, sockets->second_addr_len());
+
+ EXPECT_EQ(
+ memcmp(&addr, sockets->second_addr(),
+ std::min((size_t)addr_len, (size_t)sockets->second_addr_len())),
+ 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, AllSocketPairTest,
+ ::testing::ValuesIn(VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(FilesystemUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{
+ 0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ AbstractUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{0, SOCK_NONBLOCK})))));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/access.cc b/test/syscalls/linux/access.cc
new file mode 100644
index 000000000..bcc25cef4
--- /dev/null
+++ b/test/syscalls/linux/access.cc
@@ -0,0 +1,170 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/capability_util.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+using ::testing::Ge;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class AccessTest : public ::testing::Test {
+ public:
+ std::string CreateTempFile(int perm) {
+ const std::string path = NewTempAbsPath();
+ const int fd = open(path.c_str(), O_CREAT | O_RDONLY, perm);
+ TEST_PCHECK(fd > 0);
+ TEST_PCHECK(close(fd) == 0);
+ return path;
+ }
+
+ protected:
+ // SetUp creates various configurations of files.
+ void SetUp() override {
+ // Move to the temporary directory. This allows us to reason more easily
+ // about absolute and relative paths.
+ ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds());
+
+ // Create an empty file, standard permissions.
+ relfile_ = NewTempRelPath();
+ int fd;
+ ASSERT_THAT(fd = open(relfile_.c_str(), O_CREAT | O_TRUNC, 0644),
+ SyscallSucceedsWithValue(Ge(0)));
+ ASSERT_THAT(close(fd), SyscallSucceeds());
+ absfile_ = GetAbsoluteTestTmpdir() + "/" + relfile_;
+
+ // Create an empty directory, no writable permissions.
+ absdir_ = NewTempAbsPath();
+ reldir_ = JoinPath(Basename(absdir_), "");
+ ASSERT_THAT(mkdir(reldir_.c_str(), 0555), SyscallSucceeds());
+
+ // This file doesn't exist.
+ relnone_ = NewTempRelPath();
+ absnone_ = GetAbsoluteTestTmpdir() + "/" + relnone_;
+ }
+
+ // TearDown unlinks created files.
+ void TearDown() override {
+ ASSERT_THAT(unlink(absfile_.c_str()), SyscallSucceeds());
+ ASSERT_THAT(rmdir(absdir_.c_str()), SyscallSucceeds());
+ }
+
+ std::string relfile_;
+ std::string reldir_;
+
+ std::string absfile_;
+ std::string absdir_;
+
+ std::string relnone_;
+ std::string absnone_;
+};
+
+TEST_F(AccessTest, RelativeFile) {
+ EXPECT_THAT(access(relfile_.c_str(), R_OK), SyscallSucceeds());
+}
+
+TEST_F(AccessTest, RelativeDir) {
+ EXPECT_THAT(access(reldir_.c_str(), R_OK | X_OK), SyscallSucceeds());
+}
+
+TEST_F(AccessTest, AbsFile) {
+ EXPECT_THAT(access(absfile_.c_str(), R_OK), SyscallSucceeds());
+}
+
+TEST_F(AccessTest, AbsDir) {
+ EXPECT_THAT(access(absdir_.c_str(), R_OK | X_OK), SyscallSucceeds());
+}
+
+TEST_F(AccessTest, RelDoesNotExist) {
+ EXPECT_THAT(access(relnone_.c_str(), R_OK), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_F(AccessTest, AbsDoesNotExist) {
+ EXPECT_THAT(access(absnone_.c_str(), R_OK), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_F(AccessTest, InvalidMode) {
+ EXPECT_THAT(access(relfile_.c_str(), 0xffffffff),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(AccessTest, NoPerms) {
+ // Drop capabilities that allow us to override permissions. We must drop
+ // PERMITTED because access() checks those instead of EFFECTIVE.
+ ASSERT_NO_ERRNO(DropPermittedCapability(CAP_DAC_OVERRIDE));
+ ASSERT_NO_ERRNO(DropPermittedCapability(CAP_DAC_READ_SEARCH));
+
+ EXPECT_THAT(access(absdir_.c_str(), W_OK), SyscallFailsWithErrno(EACCES));
+}
+
+TEST_F(AccessTest, InvalidName) {
+ EXPECT_THAT(access(reinterpret_cast<char*>(0x1234), W_OK),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(AccessTest, UsrReadOnly) {
+ // Drop capabilities that allow us to override permissions. We must drop
+ // PERMITTED because access() checks those instead of EFFECTIVE.
+ ASSERT_NO_ERRNO(DropPermittedCapability(CAP_DAC_OVERRIDE));
+ ASSERT_NO_ERRNO(DropPermittedCapability(CAP_DAC_READ_SEARCH));
+
+ const std::string filename = CreateTempFile(0400);
+ EXPECT_THAT(access(filename.c_str(), R_OK), SyscallSucceeds());
+ EXPECT_THAT(access(filename.c_str(), W_OK), SyscallFailsWithErrno(EACCES));
+ EXPECT_THAT(access(filename.c_str(), X_OK), SyscallFailsWithErrno(EACCES));
+ EXPECT_THAT(unlink(filename.c_str()), SyscallSucceeds());
+}
+
+TEST_F(AccessTest, UsrReadExec) {
+ // Drop capabilities that allow us to override permissions. We must drop
+ // PERMITTED because access() checks those instead of EFFECTIVE.
+ ASSERT_NO_ERRNO(DropPermittedCapability(CAP_DAC_OVERRIDE));
+ ASSERT_NO_ERRNO(DropPermittedCapability(CAP_DAC_READ_SEARCH));
+
+ const std::string filename = CreateTempFile(0500);
+ EXPECT_THAT(access(filename.c_str(), R_OK | X_OK), SyscallSucceeds());
+ EXPECT_THAT(access(filename.c_str(), W_OK), SyscallFailsWithErrno(EACCES));
+ EXPECT_THAT(unlink(filename.c_str()), SyscallSucceeds());
+}
+
+TEST_F(AccessTest, UsrReadWrite) {
+ const std::string filename = CreateTempFile(0600);
+ EXPECT_THAT(access(filename.c_str(), R_OK | W_OK), SyscallSucceeds());
+ EXPECT_THAT(access(filename.c_str(), X_OK), SyscallFailsWithErrno(EACCES));
+ EXPECT_THAT(unlink(filename.c_str()), SyscallSucceeds());
+}
+
+TEST_F(AccessTest, UsrReadWriteExec) {
+ const std::string filename = CreateTempFile(0700);
+ EXPECT_THAT(access(filename.c_str(), R_OK | W_OK | X_OK), SyscallSucceeds());
+ EXPECT_THAT(unlink(filename.c_str()), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/affinity.cc b/test/syscalls/linux/affinity.cc
new file mode 100644
index 000000000..128364c34
--- /dev/null
+++ b/test/syscalls/linux/affinity.cc
@@ -0,0 +1,242 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sched.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_split.h"
+#include "test/util/cleanup.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+// These tests are for both the sched_getaffinity(2) and sched_setaffinity(2)
+// syscalls.
+class AffinityTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ EXPECT_THAT(
+ // Needs use the raw syscall to get the actual size.
+ cpuset_size_ = syscall(SYS_sched_getaffinity, /*pid=*/0,
+ sizeof(cpu_set_t), &mask_),
+ SyscallSucceeds());
+ // Lots of tests rely on having more than 1 logical processor available.
+ EXPECT_GT(CPU_COUNT(&mask_), 1);
+ }
+
+ static PosixError ClearLowestBit(cpu_set_t* mask, size_t cpus) {
+ const size_t mask_size = CPU_ALLOC_SIZE(cpus);
+ for (size_t n = 0; n < cpus; ++n) {
+ if (CPU_ISSET_S(n, mask_size, mask)) {
+ CPU_CLR_S(n, mask_size, mask);
+ return NoError();
+ }
+ }
+ return PosixError(EINVAL, "No bit to clear, mask is empty");
+ }
+
+ PosixError ClearLowestBit() { return ClearLowestBit(&mask_, CPU_SETSIZE); }
+
+ // Stores the initial cpu mask for this process.
+ cpu_set_t mask_ = {};
+ int cpuset_size_ = 0;
+};
+
+// sched_getaffinity(2) is implemented.
+TEST_F(AffinityTest, SchedGetAffinityImplemented) {
+ EXPECT_THAT(sched_getaffinity(/*pid=*/0, sizeof(cpu_set_t), &mask_),
+ SyscallSucceeds());
+}
+
+// PID is not found.
+TEST_F(AffinityTest, SchedGetAffinityInvalidPID) {
+ // Flaky, but it's tough to avoid a race condition when finding an unused pid
+ EXPECT_THAT(sched_getaffinity(/*pid=*/INT_MAX - 1, sizeof(cpu_set_t), &mask_),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+// PID is not found.
+TEST_F(AffinityTest, SchedSetAffinityInvalidPID) {
+ // Flaky, but it's tough to avoid a race condition when finding an unused pid
+ EXPECT_THAT(sched_setaffinity(/*pid=*/INT_MAX - 1, sizeof(cpu_set_t), &mask_),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+TEST_F(AffinityTest, SchedSetAffinityZeroMask) {
+ CPU_ZERO(&mask_);
+ EXPECT_THAT(sched_setaffinity(/*pid=*/0, sizeof(cpu_set_t), &mask_),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// N.B. This test case relies on cpuset_size_ larger than the actual number of
+// of all existing CPUs. Check your machine if the test fails.
+TEST_F(AffinityTest, SchedSetAffinityNonexistentCPUDropped) {
+ cpu_set_t mask = mask_;
+ // Add a nonexistent CPU.
+ //
+ // The number needs to be larger than the possible number of CPU available,
+ // but smaller than the number of the CPU that the kernel claims to support --
+ // it's implicitly returned by raw sched_getaffinity syscall.
+ CPU_SET(cpuset_size_ * 8 - 1, &mask);
+ EXPECT_THAT(
+ // Use raw syscall because it will be rejected by the libc wrapper
+ // otherwise.
+ syscall(SYS_sched_setaffinity, /*pid=*/0, sizeof(cpu_set_t), &mask),
+ SyscallSucceeds())
+ << "failed with cpumask : " << CPUSetToString(mask)
+ << ", cpuset_size_ : " << cpuset_size_;
+ cpu_set_t newmask;
+ EXPECT_THAT(sched_getaffinity(/*pid=*/0, sizeof(cpu_set_t), &newmask),
+ SyscallSucceeds());
+ EXPECT_TRUE(CPU_EQUAL(&mask_, &newmask))
+ << "got: " << CPUSetToString(newmask)
+ << " != expected: " << CPUSetToString(mask_);
+}
+
+TEST_F(AffinityTest, SchedSetAffinityOnlyNonexistentCPUFails) {
+ // Make an empty cpu set.
+ CPU_ZERO(&mask_);
+ // Add a nonexistent CPU.
+ //
+ // The number needs to be larger than the possible number of CPU available,
+ // but smaller than the number of the CPU that the kernel claims to support --
+ // it's implicitly returned by raw sched_getaffinity syscall.
+ int cpu = cpuset_size_ * 8 - 1;
+ if (cpu <= NumCPUs()) {
+ GTEST_SKIP() << "Skipping test: cpu " << cpu << " exists";
+ }
+ CPU_SET(cpu, &mask_);
+ EXPECT_THAT(
+ // Use raw syscall because it will be rejected by the libc wrapper
+ // otherwise.
+ syscall(SYS_sched_setaffinity, /*pid=*/0, sizeof(cpu_set_t), &mask_),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(AffinityTest, SchedSetAffinityInvalidSize) {
+ EXPECT_GT(cpuset_size_, 0);
+ // Not big enough.
+ EXPECT_THAT(sched_getaffinity(/*pid=*/0, cpuset_size_ - 1, &mask_),
+ SyscallFailsWithErrno(EINVAL));
+ // Not a multiple of word size.
+ EXPECT_THAT(sched_getaffinity(/*pid=*/0, cpuset_size_ + 1, &mask_),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(AffinityTest, Sanity) {
+ ASSERT_NO_ERRNO(ClearLowestBit());
+ EXPECT_THAT(sched_setaffinity(/*pid=*/0, sizeof(cpu_set_t), &mask_),
+ SyscallSucceeds());
+ cpu_set_t newmask;
+ EXPECT_THAT(sched_getaffinity(/*pid=*/0, sizeof(cpu_set_t), &newmask),
+ SyscallSucceeds());
+ EXPECT_TRUE(CPU_EQUAL(&mask_, &newmask))
+ << "got: " << CPUSetToString(newmask)
+ << " != expected: " << CPUSetToString(mask_);
+}
+
+TEST_F(AffinityTest, NewThread) {
+ SKIP_IF(CPU_COUNT(&mask_) < 3);
+ ASSERT_NO_ERRNO(ClearLowestBit());
+ ASSERT_NO_ERRNO(ClearLowestBit());
+ EXPECT_THAT(sched_setaffinity(/*pid=*/0, sizeof(cpu_set_t), &mask_),
+ SyscallSucceeds());
+ ScopedThread([this]() {
+ cpu_set_t child_mask;
+ ASSERT_THAT(sched_getaffinity(/*pid=*/0, sizeof(cpu_set_t), &child_mask),
+ SyscallSucceeds());
+ ASSERT_TRUE(CPU_EQUAL(&child_mask, &mask_))
+ << "child cpu mask: " << CPUSetToString(child_mask)
+ << " != parent cpu mask: " << CPUSetToString(mask_);
+ });
+}
+
+TEST_F(AffinityTest, ConsistentWithProcCpuInfo) {
+ // Count how many cpus are shown in /proc/cpuinfo.
+ std::string cpuinfo = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/cpuinfo"));
+ int count = 0;
+ for (auto const& line : absl::StrSplit(cpuinfo, '\n')) {
+ if (absl::StartsWith(line, "processor")) {
+ count++;
+ }
+ }
+ EXPECT_GE(count, CPU_COUNT(&mask_));
+}
+
+TEST_F(AffinityTest, ConsistentWithProcStat) {
+ // Count how many cpus are shown in /proc/stat.
+ std::string stat = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/stat"));
+ int count = 0;
+ for (auto const& line : absl::StrSplit(stat, '\n')) {
+ if (absl::StartsWith(line, "cpu") && !absl::StartsWith(line, "cpu ")) {
+ count++;
+ }
+ }
+ EXPECT_GE(count, CPU_COUNT(&mask_));
+}
+
+TEST_F(AffinityTest, SmallCpuMask) {
+ const int num_cpus = NumCPUs();
+ const size_t mask_size = CPU_ALLOC_SIZE(num_cpus);
+ cpu_set_t* mask = CPU_ALLOC(num_cpus);
+ ASSERT_NE(mask, nullptr);
+ const auto free_mask = Cleanup([&] { CPU_FREE(mask); });
+
+ CPU_ZERO_S(mask_size, mask);
+ ASSERT_THAT(sched_getaffinity(0, mask_size, mask), SyscallSucceeds());
+}
+
+TEST_F(AffinityTest, LargeCpuMask) {
+ // Allocate mask bigger than cpu_set_t normally allocates.
+ const size_t cpus = CPU_SETSIZE * 8;
+ const size_t mask_size = CPU_ALLOC_SIZE(cpus);
+
+ cpu_set_t* large_mask = CPU_ALLOC(cpus);
+ auto free_mask = Cleanup([large_mask] { CPU_FREE(large_mask); });
+ CPU_ZERO_S(mask_size, large_mask);
+
+ // Check that get affinity with large mask works as expected.
+ ASSERT_THAT(sched_getaffinity(/*pid=*/0, mask_size, large_mask),
+ SyscallSucceeds());
+ EXPECT_TRUE(CPU_EQUAL(&mask_, large_mask))
+ << "got: " << CPUSetToString(*large_mask, cpus)
+ << " != expected: " << CPUSetToString(mask_);
+
+ // Check that set affinity with large mask works as expected.
+ ASSERT_NO_ERRNO(ClearLowestBit(large_mask, cpus));
+ EXPECT_THAT(sched_setaffinity(/*pid=*/0, mask_size, large_mask),
+ SyscallSucceeds());
+
+ cpu_set_t* new_mask = CPU_ALLOC(cpus);
+ auto free_new_mask = Cleanup([new_mask] { CPU_FREE(new_mask); });
+ CPU_ZERO_S(mask_size, new_mask);
+ EXPECT_THAT(sched_getaffinity(/*pid=*/0, mask_size, new_mask),
+ SyscallSucceeds());
+
+ EXPECT_TRUE(CPU_EQUAL_S(mask_size, large_mask, new_mask))
+ << "got: " << CPUSetToString(*new_mask, cpus)
+ << " != expected: " << CPUSetToString(*large_mask, cpus);
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/aio.cc b/test/syscalls/linux/aio.cc
new file mode 100644
index 000000000..806d5729e
--- /dev/null
+++ b/test/syscalls/linux/aio.cc
@@ -0,0 +1,430 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <linux/aio_abi.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <string>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/proc_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+using ::testing::_;
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+// Returns the size of the VMA containing the given address.
+PosixErrorOr<size_t> VmaSizeAt(uintptr_t addr) {
+ ASSIGN_OR_RETURN_ERRNO(std::string proc_self_maps,
+ GetContents("/proc/self/maps"));
+ ASSIGN_OR_RETURN_ERRNO(auto entries, ParseProcMaps(proc_self_maps));
+ // Use binary search to find the first VMA that might contain addr.
+ ProcMapsEntry target = {};
+ target.end = addr;
+ auto it =
+ std::upper_bound(entries.begin(), entries.end(), target,
+ [](const ProcMapsEntry& x, const ProcMapsEntry& y) {
+ return x.end < y.end;
+ });
+ // Check that it actually contains addr.
+ if (it == entries.end() || addr < it->start) {
+ return PosixError(ENOENT, absl::StrCat("no VMA contains address ", addr));
+ }
+ return it->end - it->start;
+}
+
+constexpr char kData[] = "hello world!";
+
+int SubmitCtx(aio_context_t ctx, long nr, struct iocb** iocbpp) {
+ return syscall(__NR_io_submit, ctx, nr, iocbpp);
+}
+
+class AIOTest : public FileTest {
+ public:
+ AIOTest() : ctx_(0) {}
+
+ int SetupContext(unsigned int nr) {
+ return syscall(__NR_io_setup, nr, &ctx_);
+ }
+
+ int Submit(long nr, struct iocb** iocbpp) {
+ return SubmitCtx(ctx_, nr, iocbpp);
+ }
+
+ int GetEvents(long min, long max, struct io_event* events,
+ struct timespec* timeout) {
+ return RetryEINTR(syscall)(__NR_io_getevents, ctx_, min, max, events,
+ timeout);
+ }
+
+ int DestroyContext() { return syscall(__NR_io_destroy, ctx_); }
+
+ void TearDown() override {
+ FileTest::TearDown();
+ if (ctx_ != 0) {
+ ASSERT_THAT(DestroyContext(), SyscallSucceeds());
+ ctx_ = 0;
+ }
+ }
+
+ struct iocb CreateCallback() {
+ struct iocb cb = {};
+ cb.aio_data = 0x123;
+ cb.aio_fildes = test_file_fd_.get();
+ cb.aio_lio_opcode = IOCB_CMD_PWRITE;
+ cb.aio_buf = reinterpret_cast<uint64_t>(kData);
+ cb.aio_offset = 0;
+ cb.aio_nbytes = strlen(kData);
+ return cb;
+ }
+
+ protected:
+ aio_context_t ctx_;
+};
+
+TEST_F(AIOTest, BasicWrite) {
+ // Copied from fs/aio.c.
+ constexpr unsigned AIO_RING_MAGIC = 0xa10a10a1;
+ struct aio_ring {
+ unsigned id;
+ unsigned nr;
+ unsigned head;
+ unsigned tail;
+ unsigned magic;
+ unsigned compat_features;
+ unsigned incompat_features;
+ unsigned header_length;
+ struct io_event io_events[0];
+ };
+
+ // Setup a context that is 128 entries deep.
+ ASSERT_THAT(SetupContext(128), SyscallSucceeds());
+
+ // Check that 'ctx_' points to a valid address. libaio uses it to check if
+ // aio implementation uses aio_ring. gVisor doesn't and returns all zeroes.
+ // Linux implements aio_ring, so skip the zeroes check.
+ //
+ // TODO(gvisor.dev/issue/204): Remove when gVisor implements aio_ring.
+ auto ring = reinterpret_cast<struct aio_ring*>(ctx_);
+ auto magic = IsRunningOnGvisor() ? 0 : AIO_RING_MAGIC;
+ EXPECT_EQ(ring->magic, magic);
+
+ struct iocb cb = CreateCallback();
+ struct iocb* cbs[1] = {&cb};
+
+ // Submit the request.
+ ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1));
+
+ // Get the reply.
+ struct io_event events[1];
+ ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1));
+
+ // Verify that it is as expected.
+ EXPECT_EQ(events[0].data, 0x123);
+ EXPECT_EQ(events[0].obj, reinterpret_cast<long>(&cb));
+ EXPECT_EQ(events[0].res, strlen(kData));
+
+ // Verify that the file contains the contents.
+ char verify_buf[sizeof(kData)] = {};
+ ASSERT_THAT(read(test_file_fd_.get(), verify_buf, sizeof(kData)),
+ SyscallSucceedsWithValue(strlen(kData)));
+ EXPECT_STREQ(verify_buf, kData);
+}
+
+TEST_F(AIOTest, BadWrite) {
+ // Create a pipe and immediately close the read end.
+ int pipefd[2];
+ ASSERT_THAT(pipe(pipefd), SyscallSucceeds());
+
+ FileDescriptor rfd(pipefd[0]);
+ FileDescriptor wfd(pipefd[1]);
+
+ rfd.reset(); // Close the read end.
+
+ // Setup a context that is 128 entries deep.
+ ASSERT_THAT(SetupContext(128), SyscallSucceeds());
+
+ struct iocb cb = CreateCallback();
+ // Try to write to the read end.
+ cb.aio_fildes = wfd.get();
+ struct iocb* cbs[1] = {&cb};
+
+ // Submit the request.
+ ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1));
+
+ // Get the reply.
+ struct io_event events[1];
+ ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1));
+
+ // Verify that it fails with the right error code.
+ EXPECT_EQ(events[0].data, 0x123);
+ EXPECT_EQ(events[0].obj, reinterpret_cast<uint64_t>(&cb));
+ EXPECT_LT(events[0].res, 0);
+}
+
+TEST_F(AIOTest, ExitWithPendingIo) {
+ // Setup a context that is 100 entries deep.
+ ASSERT_THAT(SetupContext(100), SyscallSucceeds());
+
+ struct iocb cb = CreateCallback();
+ struct iocb* cbs[] = {&cb};
+
+ // Submit a request but don't complete it to make it pending.
+ for (int i = 0; i < 100; ++i) {
+ EXPECT_THAT(Submit(1, cbs), SyscallSucceeds());
+ }
+
+ ASSERT_THAT(DestroyContext(), SyscallSucceeds());
+ ctx_ = 0;
+}
+
+int Submitter(void* arg) {
+ auto test = reinterpret_cast<AIOTest*>(arg);
+
+ struct iocb cb = test->CreateCallback();
+ struct iocb* cbs[1] = {&cb};
+
+ // Submit the request.
+ TEST_CHECK(test->Submit(1, cbs) == 1);
+ return 0;
+}
+
+TEST_F(AIOTest, CloneVm) {
+ // Setup a context that is 128 entries deep.
+ ASSERT_THAT(SetupContext(128), SyscallSucceeds());
+
+ const size_t kStackSize = 5 * kPageSize;
+ std::unique_ptr<char[]> stack(new char[kStackSize]);
+ char* bp = stack.get() + kStackSize;
+ pid_t child;
+ ASSERT_THAT(child = clone(Submitter, bp, CLONE_VM | SIGCHLD,
+ reinterpret_cast<void*>(this)),
+ SyscallSucceeds());
+
+ // Get the reply.
+ struct io_event events[1];
+ ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1));
+
+ // Verify that it is as expected.
+ EXPECT_EQ(events[0].data, 0x123);
+ EXPECT_EQ(events[0].res, strlen(kData));
+
+ // Verify that the file contains the contents.
+ char verify_buf[32] = {};
+ ASSERT_THAT(read(test_file_fd_.get(), &verify_buf[0], strlen(kData)),
+ SyscallSucceeds());
+ EXPECT_EQ(strcmp(kData, &verify_buf[0]), 0);
+
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0),
+ SyscallSucceedsWithValue(child));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << " status " << status;
+}
+
+// Tests that AIO context can be remapped to a different address.
+TEST_F(AIOTest, Mremap) {
+ // Setup a context that is 128 entries deep.
+ ASSERT_THAT(SetupContext(128), SyscallSucceeds());
+ const size_t ctx_size =
+ ASSERT_NO_ERRNO_AND_VALUE(VmaSizeAt(reinterpret_cast<uintptr_t>(ctx_)));
+
+ struct iocb cb = CreateCallback();
+ struct iocb* cbs[1] = {&cb};
+
+ // Reserve address space for the mremap target so we have something safe to
+ // map over.
+ Mapping dst =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(ctx_size, PROT_READ, MAP_PRIVATE));
+
+ // Remap context 'handle' to a different address.
+ ASSERT_THAT(Mremap(reinterpret_cast<void*>(ctx_), ctx_size, dst.len(),
+ MREMAP_FIXED | MREMAP_MAYMOVE, dst.ptr()),
+ IsPosixErrorOkAndHolds(dst.ptr()));
+ aio_context_t old_ctx = ctx_;
+ ctx_ = reinterpret_cast<aio_context_t>(dst.addr());
+ // io_destroy() will unmap dst now.
+ dst.release();
+
+ // Check that submitting the request with the old 'ctx_' fails.
+ ASSERT_THAT(SubmitCtx(old_ctx, 1, cbs), SyscallFailsWithErrno(EINVAL));
+
+ // Submit the request with the new 'ctx_'.
+ ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1));
+
+ // Remap again.
+ dst = ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(ctx_size, PROT_READ, MAP_PRIVATE));
+ ASSERT_THAT(Mremap(reinterpret_cast<void*>(ctx_), ctx_size, dst.len(),
+ MREMAP_FIXED | MREMAP_MAYMOVE, dst.ptr()),
+ IsPosixErrorOkAndHolds(dst.ptr()));
+ ctx_ = reinterpret_cast<aio_context_t>(dst.addr());
+ dst.release();
+
+ // Get the reply with yet another 'ctx_' and verify it.
+ struct io_event events[1];
+ ASSERT_THAT(GetEvents(1, 1, events, nullptr), SyscallSucceedsWithValue(1));
+ EXPECT_EQ(events[0].data, 0x123);
+ EXPECT_EQ(events[0].obj, reinterpret_cast<long>(&cb));
+ EXPECT_EQ(events[0].res, strlen(kData));
+
+ // Verify that the file contains the contents.
+ char verify_buf[sizeof(kData)] = {};
+ ASSERT_THAT(read(test_file_fd_.get(), verify_buf, sizeof(kData)),
+ SyscallSucceedsWithValue(strlen(kData)));
+ EXPECT_STREQ(verify_buf, kData);
+}
+
+// Tests that AIO context cannot be expanded with mremap.
+TEST_F(AIOTest, MremapExpansion) {
+ // Setup a context that is 128 entries deep.
+ ASSERT_THAT(SetupContext(128), SyscallSucceeds());
+ const size_t ctx_size =
+ ASSERT_NO_ERRNO_AND_VALUE(VmaSizeAt(reinterpret_cast<uintptr_t>(ctx_)));
+
+ // Reserve address space for the mremap target so we have something safe to
+ // map over.
+ Mapping dst = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(ctx_size + kPageSize, PROT_NONE, MAP_PRIVATE));
+
+ // Test that remapping to a larger address range fails.
+ ASSERT_THAT(Mremap(reinterpret_cast<void*>(ctx_), ctx_size, dst.len(),
+ MREMAP_FIXED | MREMAP_MAYMOVE, dst.ptr()),
+ PosixErrorIs(EFAULT, _));
+
+ // mm/mremap.c:sys_mremap() => mremap_to() does do_munmap() of the destination
+ // before it hits the VM_DONTEXPAND check in vma_to_resize(), so we should no
+ // longer munmap it (another thread may have created a mapping there).
+ dst.release();
+}
+
+// Tests that AIO calls fail if context's address is inaccessible.
+TEST_F(AIOTest, Mprotect) {
+ // Setup a context that is 128 entries deep.
+ ASSERT_THAT(SetupContext(128), SyscallSucceeds());
+
+ struct iocb cb = CreateCallback();
+ struct iocb* cbs[1] = {&cb};
+
+ ASSERT_THAT(Submit(1, cbs), SyscallSucceedsWithValue(1));
+
+ // Makes the context 'handle' inaccessible and check that all subsequent
+ // calls fail.
+ ASSERT_THAT(mprotect(reinterpret_cast<void*>(ctx_), kPageSize, PROT_NONE),
+ SyscallSucceeds());
+ struct io_event events[1];
+ EXPECT_THAT(GetEvents(1, 1, events, nullptr), SyscallFailsWithErrno(EINVAL));
+ ASSERT_THAT(Submit(1, cbs), SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(DestroyContext(), SyscallFailsWithErrno(EINVAL));
+
+ // Prevent TearDown from attempting to destroy the context and fail.
+ ctx_ = 0;
+}
+
+TEST_F(AIOTest, Timeout) {
+ // Setup a context that is 128 entries deep.
+ ASSERT_THAT(SetupContext(128), SyscallSucceeds());
+
+ struct timespec timeout;
+ timeout.tv_sec = 0;
+ timeout.tv_nsec = 10;
+ struct io_event events[1];
+ ASSERT_THAT(GetEvents(1, 1, events, &timeout), SyscallSucceedsWithValue(0));
+}
+
+class AIOReadWriteParamTest : public AIOTest,
+ public ::testing::WithParamInterface<int> {};
+
+TEST_P(AIOReadWriteParamTest, BadOffset) {
+ // Setup a context that is 128 entries deep.
+ ASSERT_THAT(SetupContext(128), SyscallSucceeds());
+
+ struct iocb cb = CreateCallback();
+ struct iocb* cbs[1] = {&cb};
+
+ // Create a buffer that we can write to.
+ char buf[] = "hello world!";
+ cb.aio_buf = reinterpret_cast<uint64_t>(buf);
+
+ // Set the operation on the callback and give a negative offset.
+ const int opcode = GetParam();
+ cb.aio_lio_opcode = opcode;
+
+ iovec iov = {};
+ if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV) {
+ // Create a valid iovec and set it in the callback.
+ iov.iov_base = reinterpret_cast<void*>(buf);
+ iov.iov_len = 1;
+ cb.aio_buf = reinterpret_cast<uint64_t>(&iov);
+ // aio_nbytes is the number of iovecs.
+ cb.aio_nbytes = 1;
+ }
+
+ // Pass a negative offset.
+ cb.aio_offset = -1;
+
+ // Should get error on submission.
+ ASSERT_THAT(Submit(1, cbs), SyscallFailsWithErrno(EINVAL));
+}
+
+INSTANTIATE_TEST_SUITE_P(BadOffset, AIOReadWriteParamTest,
+ ::testing::Values(IOCB_CMD_PREAD, IOCB_CMD_PWRITE,
+ IOCB_CMD_PREADV, IOCB_CMD_PWRITEV));
+
+class AIOVectorizedParamTest : public AIOTest,
+ public ::testing::WithParamInterface<int> {};
+
+TEST_P(AIOVectorizedParamTest, BadIOVecs) {
+ // Setup a context that is 128 entries deep.
+ ASSERT_THAT(SetupContext(128), SyscallSucceeds());
+
+ struct iocb cb = CreateCallback();
+ struct iocb* cbs[1] = {&cb};
+
+ // Modify the callback to use the operation from the param.
+ cb.aio_lio_opcode = GetParam();
+
+ // Create an iovec with address in kernel range, and pass that as the buffer.
+ iovec iov = {};
+ iov.iov_base = reinterpret_cast<void*>(0xFFFFFFFF00000000);
+ iov.iov_len = 1;
+ cb.aio_buf = reinterpret_cast<uint64_t>(&iov);
+ // aio_nbytes is the number of iovecs.
+ cb.aio_nbytes = 1;
+
+ // Should get error on submission.
+ ASSERT_THAT(Submit(1, cbs), SyscallFailsWithErrno(EFAULT));
+}
+
+INSTANTIATE_TEST_SUITE_P(BadIOVecs, AIOVectorizedParamTest,
+ ::testing::Values(IOCB_CMD_PREADV, IOCB_CMD_PWRITEV));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/alarm.cc b/test/syscalls/linux/alarm.cc
new file mode 100644
index 000000000..940c97285
--- /dev/null
+++ b/test/syscalls/linux/alarm.cc
@@ -0,0 +1,192 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// N.B. Below, main blocks SIGALRM. Test cases must unblock it if they want
+// delivery.
+
+void do_nothing_handler(int sig, siginfo_t* siginfo, void* arg) {}
+
+// No random save as the test relies on alarm timing. Cooperative save tests
+// already cover the save between alarm and read.
+TEST(AlarmTest, Interrupt_NoRandomSave) {
+ int pipe_fds[2];
+ ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds());
+
+ FileDescriptor read_fd(pipe_fds[0]);
+ FileDescriptor write_fd(pipe_fds[1]);
+
+ // Use a signal handler that interrupts but does nothing rather than using the
+ // default terminate action.
+ struct sigaction sa;
+ sa.sa_sigaction = do_nothing_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = 0;
+ auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa));
+
+ // Actually allow SIGALRM delivery.
+ auto mask_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGALRM));
+
+ // Alarm in 20 second, which should be well after read blocks below.
+ ASSERT_THAT(alarm(20), SyscallSucceeds());
+
+ char buf;
+ ASSERT_THAT(read(read_fd.get(), &buf, 1), SyscallFailsWithErrno(EINTR));
+}
+
+/* Count of the number of SIGALARMS handled. */
+static volatile int alarms_received = 0;
+
+void inc_alarms_handler(int sig, siginfo_t* siginfo, void* arg) {
+ alarms_received++;
+}
+
+// No random save as the test relies on alarm timing. Cooperative save tests
+// already cover the save between alarm and read.
+TEST(AlarmTest, Restart_NoRandomSave) {
+ alarms_received = 0;
+
+ int pipe_fds[2];
+ ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds());
+
+ FileDescriptor read_fd(pipe_fds[0]);
+ // Write end closed by thread below.
+
+ struct sigaction sa;
+ sa.sa_sigaction = inc_alarms_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_RESTART;
+ auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa));
+
+ // Spawn a thread to eventually unblock the read below.
+ ScopedThread t([pipe_fds] {
+ absl::SleepFor(absl::Seconds(30));
+ EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds());
+ });
+
+ // Actually allow SIGALRM delivery.
+ auto mask_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGALRM));
+
+ // Alarm in 20 second, which should be well after read blocks below, but
+ // before it returns.
+ ASSERT_THAT(alarm(20), SyscallSucceeds());
+
+ // Read and eventually get an EOF from the writer closing. If SA_RESTART
+ // didn't work, then the alarm would not have fired and we wouldn't increment
+ // our alarms_received count in our signal handler, or we would have not
+ // restarted the syscall gracefully, which we expect below in order to be
+ // able to get the final EOF on the pipe.
+ char buf;
+ ASSERT_THAT(read(read_fd.get(), &buf, 1), SyscallSucceeds());
+ EXPECT_EQ(alarms_received, 1);
+
+ t.Join();
+}
+
+// No random save as the test relies on alarm timing. Cooperative save tests
+// already cover the save between alarm and pause.
+TEST(AlarmTest, SaSiginfo_NoRandomSave) {
+ // Use a signal handler that interrupts but does nothing rather than using the
+ // default terminate action.
+ struct sigaction sa;
+ sa.sa_sigaction = do_nothing_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa));
+
+ // Actually allow SIGALRM delivery.
+ auto mask_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGALRM));
+
+ // Alarm in 20 second, which should be well after pause blocks below.
+ ASSERT_THAT(alarm(20), SyscallSucceeds());
+ ASSERT_THAT(pause(), SyscallFailsWithErrno(EINTR));
+}
+
+// No random save as the test relies on alarm timing. Cooperative save tests
+// already cover the save between alarm and pause.
+TEST(AlarmTest, SaInterrupt_NoRandomSave) {
+ // Use a signal handler that interrupts but does nothing rather than using the
+ // default terminate action.
+ struct sigaction sa;
+ sa.sa_sigaction = do_nothing_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_INTERRUPT;
+ auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa));
+
+ // Actually allow SIGALRM delivery.
+ auto mask_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGALRM));
+
+ // Alarm in 20 second, which should be well after pause blocks below.
+ ASSERT_THAT(alarm(20), SyscallSucceeds());
+ ASSERT_THAT(pause(), SyscallFailsWithErrno(EINTR));
+}
+
+TEST(AlarmTest, UserModeSpinning) {
+ alarms_received = 0;
+
+ struct sigaction sa = {};
+ sa.sa_sigaction = inc_alarms_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa));
+
+ // Actually allow SIGALRM delivery.
+ auto mask_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGALRM));
+
+ // Alarm in 20 second, which should be well into the loop below.
+ ASSERT_THAT(alarm(20), SyscallSucceeds());
+ // Make sure that the signal gets delivered even if we are spinning in user
+ // mode when it arrives.
+ while (!alarms_received) {
+ }
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ // These tests depend on delivering SIGALRM to the main thread. Block SIGALRM
+ // so that any other threads created by TestInit will also have SIGALRM
+ // blocked.
+ sigset_t set;
+ sigemptyset(&set);
+ sigaddset(&set, SIGALRM);
+ TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
+
+ gvisor::testing::TestInit(&argc, &argv);
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/arch_prctl.cc b/test/syscalls/linux/arch_prctl.cc
new file mode 100644
index 000000000..81bf5a775
--- /dev/null
+++ b/test/syscalls/linux/arch_prctl.cc
@@ -0,0 +1,48 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <asm/prctl.h>
+#include <sys/prctl.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+// glibc does not provide a prototype for arch_prctl() so declare it here.
+extern "C" int arch_prctl(int code, uintptr_t addr);
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(ArchPrctlTest, GetSetFS) {
+ uintptr_t orig;
+ const uintptr_t kNonCanonicalFsbase = 0x4141414142424242;
+
+ // Get the original FS.base and then set it to the same value (this is
+ // intentional because FS.base is the TLS pointer so we cannot change it
+ // arbitrarily).
+ ASSERT_THAT(arch_prctl(ARCH_GET_FS, reinterpret_cast<uintptr_t>(&orig)),
+ SyscallSucceeds());
+ ASSERT_THAT(arch_prctl(ARCH_SET_FS, orig), SyscallSucceeds());
+
+ // Trying to set FS.base to a non-canonical value should return an error.
+ ASSERT_THAT(arch_prctl(ARCH_SET_FS, kNonCanonicalFsbase),
+ SyscallFailsWithErrno(EPERM));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/bad.cc b/test/syscalls/linux/bad.cc
new file mode 100644
index 000000000..a26fc6af3
--- /dev/null
+++ b/test/syscalls/linux/bad.cc
@@ -0,0 +1,45 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+#ifdef __x86_64__
+// get_kernel_syms is not supported in Linux > 2.6, and not implemented in
+// gVisor.
+constexpr uint32_t kNotImplementedSyscall = SYS_get_kernel_syms;
+#elif __aarch64__
+// Use the last of arch_specific_syscalls which are not implemented on arm64.
+constexpr uint32_t kNotImplementedSyscall = __NR_arch_specific_syscall + 15;
+#endif
+
+TEST(BadSyscallTest, NotImplemented) {
+ EXPECT_THAT(syscall(kNotImplementedSyscall), SyscallFailsWithErrno(ENOSYS));
+}
+
+TEST(BadSyscallTest, NegativeOne) {
+ EXPECT_THAT(syscall(-1), SyscallFailsWithErrno(ENOSYS));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/base_poll_test.cc b/test/syscalls/linux/base_poll_test.cc
new file mode 100644
index 000000000..ab7a19dd0
--- /dev/null
+++ b/test/syscalls/linux/base_poll_test.cc
@@ -0,0 +1,65 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/base_poll_test.h"
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/memory/memory.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+static volatile int timer_fired = 0;
+static void SigAlarmHandler(int, siginfo_t*, void*) { timer_fired = 1; }
+
+BasePollTest::BasePollTest() {
+ // Register our SIGALRM handler, but save the original so we can restore in
+ // the destructor.
+ struct sigaction sa = {};
+ sa.sa_sigaction = SigAlarmHandler;
+ sigfillset(&sa.sa_mask);
+ TEST_PCHECK(sigaction(SIGALRM, &sa, &original_alarm_sa_) == 0);
+}
+
+BasePollTest::~BasePollTest() {
+ ClearTimer();
+ TEST_PCHECK(sigaction(SIGALRM, &original_alarm_sa_, nullptr) == 0);
+}
+
+void BasePollTest::SetTimer(absl::Duration duration) {
+ pid_t tgid = getpid();
+ pid_t tid = gettid();
+ ClearTimer();
+
+ // Create a new timer thread.
+ timer_ = absl::make_unique<TimerThread>(absl::Now() + duration, tgid, tid);
+}
+
+bool BasePollTest::TimerFired() const { return timer_fired; }
+
+void BasePollTest::ClearTimer() {
+ timer_.reset();
+ timer_fired = 0;
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/base_poll_test.h b/test/syscalls/linux/base_poll_test.h
new file mode 100644
index 000000000..0d4a6701e
--- /dev/null
+++ b/test/syscalls/linux/base_poll_test.h
@@ -0,0 +1,101 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_BASE_POLL_TEST_H_
+#define GVISOR_TEST_SYSCALLS_BASE_POLL_TEST_H_
+
+#include <signal.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <memory>
+
+#include "gtest/gtest.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
+#include "test/util/logging.h"
+#include "test/util/signal_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// TimerThread is a cancelable timer.
+class TimerThread {
+ public:
+ TimerThread(absl::Time deadline, pid_t tgid, pid_t tid)
+ : thread_([=] {
+ mu_.Lock();
+ mu_.AwaitWithDeadline(absl::Condition(&cancel_), deadline);
+ if (!cancel_) {
+ TEST_PCHECK(tgkill(tgid, tid, SIGALRM) == 0);
+ }
+ mu_.Unlock();
+ }) {}
+
+ ~TimerThread() { Cancel(); }
+
+ void Cancel() {
+ absl::MutexLock ml(&mu_);
+ cancel_ = true;
+ }
+
+ private:
+ mutable absl::Mutex mu_;
+ bool cancel_ ABSL_GUARDED_BY(mu_) = false;
+
+ // Must be last to ensure that the destructor for the thread is run before
+ // any other member of the object is destroyed.
+ ScopedThread thread_;
+};
+
+// Base test fixture for poll, select, ppoll, and pselect tests.
+//
+// This fixture makes use of SIGALRM. The handler is saved in SetUp() and
+// restored in TearDown().
+class BasePollTest : public ::testing::Test {
+ protected:
+ BasePollTest();
+ ~BasePollTest() override;
+
+ // Sets a timer that will send a signal to the calling thread after
+ // `duration`.
+ void SetTimer(absl::Duration duration);
+
+ // Returns true if the timer has fired.
+ bool TimerFired() const;
+
+ // Stops the pending timer (if any) and clear the "fired" state.
+ void ClearTimer();
+
+ private:
+ // Thread that implements the timer. If the timer is stopped, timer_ is null.
+ //
+ // We have to use a thread for this purpose because tests using this fixture
+ // expect to be interrupted by the timer signal, but itimers/alarm(2) send
+ // thread-group-directed signals, which may be handled by any thread in the
+ // test process.
+ std::unique_ptr<TimerThread> timer_;
+
+ // The original SIGALRM handler, to restore in destructor.
+ struct sigaction original_alarm_sa_;
+};
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_BASE_POLL_TEST_H_
diff --git a/test/syscalls/linux/bind.cc b/test/syscalls/linux/bind.cc
new file mode 100644
index 000000000..9547c4ab2
--- /dev/null
+++ b/test/syscalls/linux/bind.cc
@@ -0,0 +1,145 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST_P(AllSocketPairTest, Bind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+}
+
+TEST_P(AllSocketPairTest, BindTooLong) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ // first_addr is a sockaddr_storage being used as a sockaddr_un. Use the full
+ // length which is longer than expected for a Unix socket.
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sizeof(sockaddr_storage)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(AllSocketPairTest, DoubleBindSocket) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ EXPECT_THAT(
+ bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ // Linux 4.09 returns EINVAL here, but some time before 4.19 it switched
+ // to EADDRINUSE.
+ AnyOf(SyscallFailsWithErrno(EADDRINUSE), SyscallFailsWithErrno(EINVAL)));
+}
+
+TEST_P(AllSocketPairTest, GetLocalAddr) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ socklen_t addressLength = sockets->first_addr_size();
+ struct sockaddr_storage address = {};
+ ASSERT_THAT(getsockname(sockets->first_fd(), (struct sockaddr*)(&address),
+ &addressLength),
+ SyscallSucceeds());
+ EXPECT_EQ(
+ 0, memcmp(&address, sockets->first_addr(), sockets->first_addr_size()));
+}
+
+TEST_P(AllSocketPairTest, GetLocalAddrWithoutBind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ socklen_t addressLength = sockets->first_addr_size();
+ struct sockaddr_storage received_address = {};
+ ASSERT_THAT(
+ getsockname(sockets->first_fd(), (struct sockaddr*)(&received_address),
+ &addressLength),
+ SyscallSucceeds());
+ struct sockaddr_storage want_address = {};
+ want_address.ss_family = sockets->first_addr()->sa_family;
+ EXPECT_EQ(0, memcmp(&received_address, &want_address, addressLength));
+}
+
+TEST_P(AllSocketPairTest, GetRemoteAddressWithoutConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ socklen_t addressLength = sockets->first_addr_size();
+ struct sockaddr_storage address = {};
+ ASSERT_THAT(getpeername(sockets->second_fd(), (struct sockaddr*)(&address),
+ &addressLength),
+ SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(AllSocketPairTest, DoubleBindAddress) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ EXPECT_THAT(bind(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(AllSocketPairTest, Unbind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+
+ // Filesystem Unix sockets do not release their address when closed.
+ if (sockets->first_addr()->sa_data[0] != 0) {
+ ASSERT_THAT(bind(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallFailsWithErrno(EADDRINUSE));
+ return;
+ }
+
+ ASSERT_THAT(bind(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, AllSocketPairTest,
+ ::testing::ValuesIn(VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(
+ FilesystemUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM,
+ SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ AbstractUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM,
+ SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK})))));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/brk.cc b/test/syscalls/linux/brk.cc
new file mode 100644
index 000000000..a03a44465
--- /dev/null
+++ b/test/syscalls/linux/brk.cc
@@ -0,0 +1,31 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST(BrkTest, BrkSyscallReturnsOldBrkOnFailure) {
+ auto old_brk = sbrk(0);
+ EXPECT_THAT(syscall(SYS_brk, reinterpret_cast<void*>(-1)),
+ SyscallSucceedsWithValue(reinterpret_cast<uintptr_t>(old_brk)));
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/chdir.cc b/test/syscalls/linux/chdir.cc
new file mode 100644
index 000000000..3182c228b
--- /dev/null
+++ b/test/syscalls/linux/chdir.cc
@@ -0,0 +1,64 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <linux/limits.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/util/capability_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(ChdirTest, Success) {
+ auto old_dir = GetAbsoluteTestTmpdir();
+ auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(chdir(temp_dir.path().c_str()), SyscallSucceeds());
+ // Temp path destructor deletes the newly created tmp dir and Sentry rejects
+ // saving when its current dir is still pointing to the path. Switch to a
+ // permanent path here.
+ EXPECT_THAT(chdir(old_dir.c_str()), SyscallSucceeds());
+}
+
+TEST(ChdirTest, PermissionDenied) {
+ // Drop capabilities that allow us to override directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0666 /* mode */));
+ EXPECT_THAT(chdir(temp_dir.path().c_str()), SyscallFailsWithErrno(EACCES));
+}
+
+TEST(ChdirTest, NotDir) {
+ auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ EXPECT_THAT(chdir(temp_file.path().c_str()), SyscallFailsWithErrno(ENOTDIR));
+}
+
+TEST(ChdirTest, NotExist) {
+ EXPECT_THAT(chdir("/foo/bar"), SyscallFailsWithErrno(ENOENT));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/chmod.cc b/test/syscalls/linux/chmod.cc
new file mode 100644
index 000000000..a06b5cfd6
--- /dev/null
+++ b/test/syscalls/linux/chmod.cc
@@ -0,0 +1,264 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(ChmodTest, ChmodFileSucceeds) {
+ // Drop capabilities that allow us to override file permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ ASSERT_THAT(chmod(file.path().c_str(), 0466), SyscallSucceeds());
+ EXPECT_THAT(open(file.path().c_str(), O_RDWR), SyscallFailsWithErrno(EACCES));
+}
+
+TEST(ChmodTest, ChmodDirSucceeds) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const std::string fileInDir = NewTempAbsPathInDir(dir.path());
+
+ ASSERT_THAT(chmod(dir.path().c_str(), 0466), SyscallSucceeds());
+ EXPECT_THAT(open(fileInDir.c_str(), O_RDONLY), SyscallFailsWithErrno(EACCES));
+}
+
+TEST(ChmodTest, FchmodFileSucceeds_NoRandomSave) {
+ // Drop capabilities that allow us to file directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0666));
+ int fd;
+ ASSERT_THAT(fd = open(file.path().c_str(), O_RDWR), SyscallSucceeds());
+
+ {
+ const DisableSave ds; // File permissions are reduced.
+ ASSERT_THAT(fchmod(fd, 0444), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ }
+
+ EXPECT_THAT(open(file.path().c_str(), O_RDWR), SyscallFailsWithErrno(EACCES));
+}
+
+TEST(ChmodTest, FchmodDirSucceeds_NoRandomSave) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ int fd;
+ ASSERT_THAT(fd = open(dir.path().c_str(), O_RDONLY | O_DIRECTORY),
+ SyscallSucceeds());
+
+ {
+ const DisableSave ds; // File permissions are reduced.
+ ASSERT_THAT(fchmod(fd, 0), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ }
+
+ EXPECT_THAT(open(dir.path().c_str(), O_RDONLY),
+ SyscallFailsWithErrno(EACCES));
+}
+
+TEST(ChmodTest, FchmodBadF) {
+ ASSERT_THAT(fchmod(-1, 0444), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(ChmodTest, FchmodatBadF) {
+ ASSERT_THAT(fchmodat(-1, "foo", 0444, 0), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(ChmodTest, FchmodatNotDir) {
+ ASSERT_THAT(fchmodat(-1, "", 0444, 0), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(ChmodTest, FchmodatFileAbsolutePath) {
+ // Drop capabilities that allow us to override file permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ ASSERT_THAT(fchmodat(-1, file.path().c_str(), 0444, 0), SyscallSucceeds());
+ EXPECT_THAT(open(file.path().c_str(), O_RDWR), SyscallFailsWithErrno(EACCES));
+}
+
+TEST(ChmodTest, FchmodatDirAbsolutePath) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ int fd;
+ ASSERT_THAT(fd = open(dir.path().c_str(), O_RDONLY | O_DIRECTORY),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+
+ ASSERT_THAT(fchmodat(-1, dir.path().c_str(), 0, 0), SyscallSucceeds());
+ EXPECT_THAT(open(dir.path().c_str(), O_RDONLY),
+ SyscallFailsWithErrno(EACCES));
+}
+
+TEST(ChmodTest, FchmodatFile) {
+ // Drop capabilities that allow us to override file permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+
+ auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ int parent_fd;
+ ASSERT_THAT(
+ parent_fd = open(GetAbsoluteTestTmpdir().c_str(), O_RDONLY | O_DIRECTORY),
+ SyscallSucceeds());
+
+ ASSERT_THAT(
+ fchmodat(parent_fd, std::string(Basename(temp_file.path())).c_str(), 0444,
+ 0),
+ SyscallSucceeds());
+ EXPECT_THAT(close(parent_fd), SyscallSucceeds());
+
+ EXPECT_THAT(open(temp_file.path().c_str(), O_RDWR),
+ SyscallFailsWithErrno(EACCES));
+}
+
+TEST(ChmodTest, FchmodatDir) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ int parent_fd;
+ ASSERT_THAT(
+ parent_fd = open(GetAbsoluteTestTmpdir().c_str(), O_RDONLY | O_DIRECTORY),
+ SyscallSucceeds());
+
+ int fd;
+ ASSERT_THAT(fd = open(dir.path().c_str(), O_RDONLY | O_DIRECTORY),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+
+ ASSERT_THAT(
+ fchmodat(parent_fd, std::string(Basename(dir.path())).c_str(), 0, 0),
+ SyscallSucceeds());
+ EXPECT_THAT(close(parent_fd), SyscallSucceeds());
+
+ EXPECT_THAT(open(dir.path().c_str(), O_RDONLY | O_DIRECTORY),
+ SyscallFailsWithErrno(EACCES));
+}
+
+TEST(ChmodTest, ChmodDowngradeWritability_NoRandomSave) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0666));
+
+ int fd;
+ ASSERT_THAT(fd = open(file.path().c_str(), O_RDWR), SyscallSucceeds());
+
+ const DisableSave ds; // Permissions are dropped.
+ ASSERT_THAT(chmod(file.path().c_str(), 0444), SyscallSucceeds());
+ EXPECT_THAT(write(fd, "hello", 5), SyscallSucceedsWithValue(5));
+
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST(ChmodTest, ChmodFileToNoPermissionsSucceeds) {
+ // Drop capabilities that allow us to override file permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0666));
+
+ ASSERT_THAT(chmod(file.path().c_str(), 0), SyscallSucceeds());
+
+ EXPECT_THAT(open(file.path().c_str(), O_RDONLY),
+ SyscallFailsWithErrno(EACCES));
+}
+
+TEST(ChmodTest, FchmodDowngradeWritability_NoRandomSave) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ int fd;
+ ASSERT_THAT(fd = open(file.path().c_str(), O_RDWR | O_CREAT, 0666),
+ SyscallSucceeds());
+
+ const DisableSave ds; // Permissions are dropped.
+ ASSERT_THAT(fchmod(fd, 0444), SyscallSucceeds());
+ EXPECT_THAT(write(fd, "hello", 5), SyscallSucceedsWithValue(5));
+
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST(ChmodTest, FchmodFileToNoPermissionsSucceeds_NoRandomSave) {
+ // Drop capabilities that allow us to override file permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0666));
+
+ int fd;
+ ASSERT_THAT(fd = open(file.path().c_str(), O_RDWR), SyscallSucceeds());
+
+ {
+ const DisableSave ds; // Permissions are dropped.
+ ASSERT_THAT(fchmod(fd, 0), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ }
+
+ EXPECT_THAT(open(file.path().c_str(), O_RDONLY),
+ SyscallFailsWithErrno(EACCES));
+}
+
+// Verify that we can get a RW FD after chmod, even if a RO fd is left open.
+TEST(ChmodTest, ChmodWritableWithOpenFD) {
+ // FIXME(b/72455313): broken on hostfs.
+ if (IsRunningOnGvisor()) {
+ return;
+ }
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0444));
+
+ FileDescriptor fd1 = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+ ASSERT_THAT(fchmod(fd1.get(), 0644), SyscallSucceeds());
+
+ // This FD is writable, even though fd1 has a read-only reference to the file.
+ FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+ // fd1 is not writable, but fd2 is.
+ char c = 'a';
+ EXPECT_THAT(WriteFd(fd1.get(), &c, 1), SyscallFailsWithErrno(EBADF));
+ EXPECT_THAT(WriteFd(fd2.get(), &c, 1), SyscallSucceedsWithValue(1));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/chown.cc b/test/syscalls/linux/chown.cc
new file mode 100644
index 000000000..7a28b674d
--- /dev/null
+++ b/test/syscalls/linux/chown.cc
@@ -0,0 +1,206 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <grp.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/flags/flag.h"
+#include "absl/synchronization/notification.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+ABSL_FLAG(int32_t, scratch_uid1, 65534, "first scratch UID");
+ABSL_FLAG(int32_t, scratch_uid2, 65533, "second scratch UID");
+ABSL_FLAG(int32_t, scratch_gid, 65534, "first scratch GID");
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(ChownTest, FchownBadF) {
+ ASSERT_THAT(fchown(-1, 0, 0), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(ChownTest, FchownatBadF) {
+ ASSERT_THAT(fchownat(-1, "fff", 0, 0, 0), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(ChownTest, FchownatEmptyPath) {
+ const auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const auto fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY | O_RDONLY));
+ ASSERT_THAT(fchownat(fd.get(), "", 0, 0, 0), SyscallFailsWithErrno(ENOENT));
+}
+
+using Chown =
+ std::function<PosixError(const std::string&, uid_t owner, gid_t group)>;
+
+class ChownParamTest : public ::testing::TestWithParam<Chown> {};
+
+TEST_P(ChownParamTest, ChownFileSucceeds) {
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_CHOWN))) {
+ ASSERT_NO_ERRNO(SetCapability(CAP_CHOWN, false));
+ }
+
+ const auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // At least *try* setting to a group other than the EGID.
+ gid_t gid;
+ EXPECT_THAT(gid = getegid(), SyscallSucceeds());
+ int num_groups;
+ EXPECT_THAT(num_groups = getgroups(0, nullptr), SyscallSucceeds());
+ if (num_groups > 0) {
+ std::vector<gid_t> list(num_groups);
+ EXPECT_THAT(getgroups(list.size(), list.data()), SyscallSucceeds());
+ gid = list[0];
+ }
+
+ EXPECT_NO_ERRNO(GetParam()(file.path(), geteuid(), gid));
+
+ struct stat s = {};
+ ASSERT_THAT(stat(file.path().c_str(), &s), SyscallSucceeds());
+ EXPECT_EQ(s.st_uid, geteuid());
+ EXPECT_EQ(s.st_gid, gid);
+}
+
+TEST_P(ChownParamTest, ChownFilePermissionDenied) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID)));
+
+ const auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0777));
+
+ // Drop privileges and change IDs only in child thread, or else this parent
+ // thread won't be able to open some log files after the test ends.
+ ScopedThread([&] {
+ // Drop privileges.
+ if (HaveCapability(CAP_CHOWN).ValueOrDie()) {
+ EXPECT_NO_ERRNO(SetCapability(CAP_CHOWN, false));
+ }
+
+ // Change EUID and EGID.
+ //
+ // See note about POSIX below.
+ EXPECT_THAT(
+ syscall(SYS_setresgid, -1, absl::GetFlag(FLAGS_scratch_gid), -1),
+ SyscallSucceeds());
+ EXPECT_THAT(
+ syscall(SYS_setresuid, -1, absl::GetFlag(FLAGS_scratch_uid1), -1),
+ SyscallSucceeds());
+
+ EXPECT_THAT(GetParam()(file.path(), geteuid(), getegid()),
+ PosixErrorIs(EPERM, ::testing::ContainsRegex("chown")));
+ });
+}
+
+TEST_P(ChownParamTest, ChownFileSucceedsAsRoot) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_CHOWN))));
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_SETUID))));
+
+ const std::string filename = NewTempAbsPath();
+
+ absl::Notification fileCreated, fileChowned;
+ // Change UID only in child thread, or else this parent thread won't be able
+ // to open some log files after the test ends.
+ ScopedThread t([&] {
+ // POSIX requires that all threads in a process share the same UIDs, so
+ // the NPTL setresuid wrappers use signals to make all threads execute the
+ // setresuid syscall. However, we want this thread to have its own set of
+ // credentials different from the parent process, so we use the raw
+ // syscall.
+ EXPECT_THAT(
+ syscall(SYS_setresuid, -1, absl::GetFlag(FLAGS_scratch_uid2), -1),
+ SyscallSucceeds());
+
+ // Create file and immediately close it.
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_CREAT | O_RDWR, 0644));
+ fd.reset(); // Close the fd.
+
+ fileCreated.Notify();
+ fileChowned.WaitForNotification();
+
+ EXPECT_THAT(open(filename.c_str(), O_RDWR), SyscallFailsWithErrno(EACCES));
+ FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_RDONLY));
+ });
+
+ fileCreated.WaitForNotification();
+
+ // Set file's owners to someone different.
+ EXPECT_NO_ERRNO(GetParam()(filename, absl::GetFlag(FLAGS_scratch_uid1),
+ absl::GetFlag(FLAGS_scratch_gid)));
+
+ struct stat s;
+ EXPECT_THAT(stat(filename.c_str(), &s), SyscallSucceeds());
+ EXPECT_EQ(s.st_uid, absl::GetFlag(FLAGS_scratch_uid1));
+ EXPECT_EQ(s.st_gid, absl::GetFlag(FLAGS_scratch_gid));
+
+ fileChowned.Notify();
+}
+
+PosixError errorFromReturn(const std::string& name, int ret) {
+ if (ret == -1) {
+ return PosixError(errno, absl::StrCat(name, " failed"));
+ }
+ return NoError();
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ ChownKinds, ChownParamTest,
+ ::testing::Values(
+ [](const std::string& path, uid_t owner, gid_t group) -> PosixError {
+ int rc = chown(path.c_str(), owner, group);
+ MaybeSave();
+ return errorFromReturn("chown", rc);
+ },
+ [](const std::string& path, uid_t owner, gid_t group) -> PosixError {
+ int rc = lchown(path.c_str(), owner, group);
+ MaybeSave();
+ return errorFromReturn("lchown", rc);
+ },
+ [](const std::string& path, uid_t owner, gid_t group) -> PosixError {
+ ASSIGN_OR_RETURN_ERRNO(auto fd, Open(path, O_RDWR));
+ int rc = fchown(fd.get(), owner, group);
+ MaybeSave();
+ return errorFromReturn("fchown", rc);
+ },
+ [](const std::string& path, uid_t owner, gid_t group) -> PosixError {
+ ASSIGN_OR_RETURN_ERRNO(auto fd, Open(path, O_RDWR));
+ int rc = fchownat(fd.get(), "", owner, group, AT_EMPTY_PATH);
+ MaybeSave();
+ return errorFromReturn("fchownat-fd", rc);
+ },
+ [](const std::string& path, uid_t owner, gid_t group) -> PosixError {
+ ASSIGN_OR_RETURN_ERRNO(auto dirfd, Open(std::string(Dirname(path)),
+ O_DIRECTORY | O_RDONLY));
+ int rc = fchownat(dirfd.get(), std::string(Basename(path)).c_str(),
+ owner, group, 0);
+ MaybeSave();
+ return errorFromReturn("fchownat-dirfd", rc);
+ }));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/chroot.cc b/test/syscalls/linux/chroot.cc
new file mode 100644
index 000000000..85ec013d5
--- /dev/null
+++ b/test/syscalls/linux/chroot.cc
@@ -0,0 +1,366 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/mount_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+using ::testing::HasSubstr;
+using ::testing::Not;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(ChrootTest, Success) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
+
+ auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(chroot(temp_dir.path().c_str()), SyscallSucceeds());
+}
+
+TEST(ChrootTest, PermissionDenied) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
+
+ // CAP_DAC_READ_SEARCH and CAP_DAC_OVERRIDE may override Execute permission on
+ // directories.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+
+ auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0666 /* mode */));
+ EXPECT_THAT(chroot(temp_dir.path().c_str()), SyscallFailsWithErrno(EACCES));
+}
+
+TEST(ChrootTest, NotDir) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
+
+ auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ EXPECT_THAT(chroot(temp_file.path().c_str()), SyscallFailsWithErrno(ENOTDIR));
+}
+
+TEST(ChrootTest, NotExist) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
+
+ EXPECT_THAT(chroot("/foo/bar"), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(ChrootTest, WithoutCapability) {
+ // Unset CAP_SYS_CHROOT.
+ ASSERT_NO_ERRNO(SetCapability(CAP_SYS_CHROOT, false));
+
+ auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(chroot(temp_dir.path().c_str()), SyscallFailsWithErrno(EPERM));
+}
+
+TEST(ChrootTest, CreatesNewRoot) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
+
+ // Grab the initial cwd.
+ char initial_cwd[1024];
+ ASSERT_THAT(syscall(__NR_getcwd, initial_cwd, sizeof(initial_cwd)),
+ SyscallSucceeds());
+
+ auto new_root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto file_in_new_root =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(new_root.path()));
+
+ // chroot into new_root.
+ ASSERT_THAT(chroot(new_root.path().c_str()), SyscallSucceeds());
+
+ // getcwd should return "(unreachable)" followed by the initial_cwd.
+ char cwd[1024];
+ ASSERT_THAT(syscall(__NR_getcwd, cwd, sizeof(cwd)), SyscallSucceeds());
+ std::string expected_cwd = "(unreachable)";
+ expected_cwd += initial_cwd;
+ EXPECT_STREQ(cwd, expected_cwd.c_str());
+
+ // Should not be able to stat file by its full path.
+ struct stat statbuf;
+ EXPECT_THAT(stat(file_in_new_root.path().c_str(), &statbuf),
+ SyscallFailsWithErrno(ENOENT));
+
+ // Should be able to stat file at new rooted path.
+ auto basename = std::string(Basename(file_in_new_root.path()));
+ auto rootedFile = "/" + basename;
+ ASSERT_THAT(stat(rootedFile.c_str(), &statbuf), SyscallSucceeds());
+
+ // Should be able to stat cwd at '.' even though it's outside root.
+ ASSERT_THAT(stat(".", &statbuf), SyscallSucceeds());
+
+ // chdir into new root.
+ ASSERT_THAT(chdir("/"), SyscallSucceeds());
+
+ // getcwd should return "/".
+ EXPECT_THAT(syscall(__NR_getcwd, cwd, sizeof(cwd)), SyscallSucceeds());
+ EXPECT_STREQ(cwd, "/");
+
+ // Statting '.', '..', '/', and '/..' all return the same dev and inode.
+ struct stat statbuf_dot;
+ ASSERT_THAT(stat(".", &statbuf_dot), SyscallSucceeds());
+ struct stat statbuf_dotdot;
+ ASSERT_THAT(stat("..", &statbuf_dotdot), SyscallSucceeds());
+ EXPECT_EQ(statbuf_dot.st_dev, statbuf_dotdot.st_dev);
+ EXPECT_EQ(statbuf_dot.st_ino, statbuf_dotdot.st_ino);
+ struct stat statbuf_slash;
+ ASSERT_THAT(stat("/", &statbuf_slash), SyscallSucceeds());
+ EXPECT_EQ(statbuf_dot.st_dev, statbuf_slash.st_dev);
+ EXPECT_EQ(statbuf_dot.st_ino, statbuf_slash.st_ino);
+ struct stat statbuf_slashdotdot;
+ ASSERT_THAT(stat("/..", &statbuf_slashdotdot), SyscallSucceeds());
+ EXPECT_EQ(statbuf_dot.st_dev, statbuf_slashdotdot.st_dev);
+ EXPECT_EQ(statbuf_dot.st_ino, statbuf_slashdotdot.st_ino);
+}
+
+TEST(ChrootTest, DotDotFromOpenFD) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
+
+ auto dir_outside_root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(dir_outside_root.path(), O_RDONLY | O_DIRECTORY));
+ auto new_root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ // chroot into new_root.
+ ASSERT_THAT(chroot(new_root.path().c_str()), SyscallSucceeds());
+
+ // openat on fd with path .. will succeed.
+ int other_fd;
+ ASSERT_THAT(other_fd = openat(fd.get(), "..", O_RDONLY), SyscallSucceeds());
+ EXPECT_THAT(close(other_fd), SyscallSucceeds());
+
+ // getdents on fd should not error.
+ char buf[1024];
+ ASSERT_THAT(syscall(SYS_getdents64, fd.get(), buf, sizeof(buf)),
+ SyscallSucceeds());
+}
+
+// Test that link resolution in a chroot can escape the root by following an
+// open proc fd. Regression test for b/32316719.
+TEST(ChrootTest, ProcFdLinkResolutionInChroot) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
+
+ const TempPath file_outside_chroot =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file_outside_chroot.path(), O_RDONLY));
+
+ const FileDescriptor proc_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open("/proc", O_DIRECTORY | O_RDONLY | O_CLOEXEC));
+
+ auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ ASSERT_THAT(chroot(temp_dir.path().c_str()), SyscallSucceeds());
+
+ // Opening relative to an already open fd to a node outside the chroot works.
+ const FileDescriptor proc_self_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ OpenAt(proc_fd.get(), "self/fd", O_DIRECTORY | O_RDONLY | O_CLOEXEC));
+
+ // Proc fd symlinks can escape the chroot if the fd the symlink refers to
+ // refers to an object outside the chroot.
+ struct stat s = {};
+ EXPECT_THAT(
+ fstatat(proc_self_fd.get(), absl::StrCat(fd.get()).c_str(), &s, 0),
+ SyscallSucceeds());
+
+ // Try to stat the stdin fd. Internally, this is handled differently from a
+ // proc fd entry pointing to a file, since stdin is backed by a host fd, and
+ // isn't a walkable path on the filesystem inside the sandbox.
+ EXPECT_THAT(fstatat(proc_self_fd.get(), "0", &s, 0), SyscallSucceeds());
+}
+
+// This test will verify that when you hold a fd to proc before entering
+// a chroot that any files inside the chroot will appear rooted to the
+// base chroot when examining /proc/self/fd/{num}.
+TEST(ChrootTest, ProcMemSelfFdsNoEscapeProcOpen) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
+
+ // Get a FD to /proc before we enter the chroot.
+ const FileDescriptor proc =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc", O_RDONLY));
+
+ // Create and enter a chroot directory.
+ const auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ ASSERT_THAT(chroot(temp_dir.path().c_str()), SyscallSucceeds());
+
+ // Open a file inside the chroot at /foo.
+ const FileDescriptor foo =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/foo", O_CREAT | O_RDONLY, 0644));
+
+ // Examine /proc/self/fd/{foo_fd} to see if it exposes the fact that we're
+ // inside a chroot, the path should be /foo and NOT {chroot_dir}/foo.
+ const std::string fd_path = absl::StrCat("self/fd/", foo.get());
+ char buf[1024] = {};
+ size_t bytes_read = 0;
+ ASSERT_THAT(bytes_read =
+ readlinkat(proc.get(), fd_path.c_str(), buf, sizeof(buf) - 1),
+ SyscallSucceeds());
+
+ // The link should resolve to something.
+ ASSERT_GT(bytes_read, 0);
+
+ // Assert that the link doesn't contain the chroot path and is only /foo.
+ EXPECT_STREQ(buf, "/foo");
+}
+
+// This test will verify that a file inside a chroot when mmapped will not
+// expose the full file path via /proc/self/maps and instead honor the chroot.
+TEST(ChrootTest, ProcMemSelfMapsNoEscapeProcOpen) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
+
+ // Get a FD to /proc before we enter the chroot.
+ const FileDescriptor proc =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc", O_RDONLY));
+
+ // Create and enter a chroot directory.
+ const auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ ASSERT_THAT(chroot(temp_dir.path().c_str()), SyscallSucceeds());
+
+ // Open a file inside the chroot at /foo.
+ const FileDescriptor foo =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/foo", O_CREAT | O_RDONLY, 0644));
+
+ // Mmap the newly created file.
+ void* foo_map = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
+ foo.get(), 0);
+ ASSERT_THAT(reinterpret_cast<int64_t>(foo_map), SyscallSucceeds());
+
+ // Always unmap.
+ auto cleanup_map = Cleanup(
+ [&] { EXPECT_THAT(munmap(foo_map, kPageSize), SyscallSucceeds()); });
+
+ // Examine /proc/self/maps to be sure that /foo doesn't appear to be
+ // mapped with the full chroot path.
+ const FileDescriptor maps =
+ ASSERT_NO_ERRNO_AND_VALUE(OpenAt(proc.get(), "self/maps", O_RDONLY));
+
+ size_t bytes_read = 0;
+ char buf[8 * 1024] = {};
+ ASSERT_THAT(bytes_read = ReadFd(maps.get(), buf, sizeof(buf)),
+ SyscallSucceeds());
+
+ // The maps file should have something.
+ ASSERT_GT(bytes_read, 0);
+
+ // Finally we want to make sure the maps don't contain the chroot path
+ ASSERT_EQ(std::string(buf, bytes_read).find(temp_dir.path()),
+ std::string::npos);
+}
+
+// Test that mounts outside the chroot will not appear in /proc/self/mounts or
+// /proc/self/mountinfo.
+TEST(ChrootTest, ProcMountsMountinfoNoEscape) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_CHROOT)));
+
+ // We are going to create some mounts and then chroot. In order to be able to
+ // unmount the mounts after the test run, we must chdir to the root and use
+ // relative paths for all mounts. That way, as long as we never chdir into
+ // the new root, we can access the mounts via relative paths and unmount them.
+ ASSERT_THAT(chdir("/"), SyscallSucceeds());
+
+ // Create nested tmpfs mounts. Note the use of relative paths in Mount calls.
+ auto const outer_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto const outer_mount = ASSERT_NO_ERRNO_AND_VALUE(Mount(
+ "none", JoinPath(".", outer_dir.path()), "tmpfs", 0, "mode=0700", 0));
+
+ auto const inner_dir =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(outer_dir.path()));
+ auto const inner_mount = ASSERT_NO_ERRNO_AND_VALUE(Mount(
+ "none", JoinPath(".", inner_dir.path()), "tmpfs", 0, "mode=0700", 0));
+
+ // Filenames that will be checked for mounts, all relative to /proc dir.
+ std::string paths[3] = {"mounts", "self/mounts", "self/mountinfo"};
+
+ for (const std::string& path : paths) {
+ // We should have both inner and outer mounts.
+ const std::string contents =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents(JoinPath("/proc", path)));
+ EXPECT_THAT(contents, AllOf(HasSubstr(outer_dir.path()),
+ HasSubstr(inner_dir.path())));
+ // We better have at least two mounts: the mounts we created plus the root.
+ std::vector<absl::string_view> submounts =
+ absl::StrSplit(contents, '\n', absl::SkipWhitespace());
+ EXPECT_GT(submounts.size(), 2);
+ }
+
+ // Get a FD to /proc before we enter the chroot.
+ const FileDescriptor proc =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc", O_RDONLY));
+
+ // Chroot to outer mount.
+ ASSERT_THAT(chroot(outer_dir.path().c_str()), SyscallSucceeds());
+
+ for (const std::string& path : paths) {
+ const FileDescriptor proc_file =
+ ASSERT_NO_ERRNO_AND_VALUE(OpenAt(proc.get(), path, O_RDONLY));
+
+ // Only two mounts visible from this chroot: the inner and outer. Both
+ // paths should be relative to the new chroot.
+ const std::string contents =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContentsFD(proc_file.get()));
+ EXPECT_THAT(contents,
+ AllOf(HasSubstr(absl::StrCat(Basename(inner_dir.path()))),
+ Not(HasSubstr(outer_dir.path())),
+ Not(HasSubstr(inner_dir.path()))));
+ std::vector<absl::string_view> submounts =
+ absl::StrSplit(contents, '\n', absl::SkipWhitespace());
+ EXPECT_EQ(submounts.size(), 2);
+ }
+
+ // Chroot to inner mount. We must use an absolute path accessible to our
+ // chroot.
+ const std::string inner_dir_basename =
+ absl::StrCat("/", Basename(inner_dir.path()));
+ ASSERT_THAT(chroot(inner_dir_basename.c_str()), SyscallSucceeds());
+
+ for (const std::string& path : paths) {
+ const FileDescriptor proc_file =
+ ASSERT_NO_ERRNO_AND_VALUE(OpenAt(proc.get(), path, O_RDONLY));
+ const std::string contents =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContentsFD(proc_file.get()));
+
+ // Only the inner mount visible from this chroot.
+ std::vector<absl::string_view> submounts =
+ absl::StrSplit(contents, '\n', absl::SkipWhitespace());
+ EXPECT_EQ(submounts.size(), 1);
+ }
+
+ // Chroot back to ".".
+ ASSERT_THAT(chroot("."), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/clock_getres.cc b/test/syscalls/linux/clock_getres.cc
new file mode 100644
index 000000000..c408b936c
--- /dev/null
+++ b/test/syscalls/linux/clock_getres.cc
@@ -0,0 +1,37 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/time.h>
+#include <time.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// clock_getres works regardless of whether or not a timespec is passed.
+TEST(ClockGetres, Timespec) {
+ struct timespec ts;
+ EXPECT_THAT(clock_getres(CLOCK_MONOTONIC, &ts), SyscallSucceeds());
+ EXPECT_THAT(clock_getres(CLOCK_MONOTONIC, nullptr), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/clock_gettime.cc b/test/syscalls/linux/clock_gettime.cc
new file mode 100644
index 000000000..7f6015049
--- /dev/null
+++ b/test/syscalls/linux/clock_gettime.cc
@@ -0,0 +1,163 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <pthread.h>
+#include <sys/time.h>
+
+#include <cerrno>
+#include <cstdint>
+#include <ctime>
+#include <list>
+#include <memory>
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+int64_t clock_gettime_nsecs(clockid_t id) {
+ struct timespec ts;
+ TEST_PCHECK(clock_gettime(id, &ts) == 0);
+ return (ts.tv_sec * 1000000000 + ts.tv_nsec);
+}
+
+// Spin on the CPU for at least ns nanoseconds, based on
+// CLOCK_THREAD_CPUTIME_ID.
+void spin_ns(int64_t ns) {
+ int64_t start = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID);
+ int64_t end = start + ns;
+
+ do {
+ constexpr int kLoopCount = 1000000; // large and arbitrary
+ // volatile to prevent the compiler from skipping this loop.
+ for (volatile int i = 0; i < kLoopCount; i++) {
+ }
+ } while (clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID) < end);
+}
+
+// Test that CLOCK_PROCESS_CPUTIME_ID is a superset of CLOCK_THREAD_CPUTIME_ID.
+TEST(ClockGettime, CputimeId) {
+ constexpr int kNumThreads = 13; // arbitrary
+
+ absl::Duration spin_time = absl::Seconds(1);
+
+ // Start off the worker threads and compute the aggregate time spent by
+ // the workers. Note that we test CLOCK_PROCESS_CPUTIME_ID by having the
+ // workers execute in parallel and verifying that CLOCK_PROCESS_CPUTIME_ID
+ // accumulates the runtime of all threads.
+ int64_t start = clock_gettime_nsecs(CLOCK_PROCESS_CPUTIME_ID);
+
+ // Create a kNumThreads threads.
+ std::list<ScopedThread> threads;
+ for (int i = 0; i < kNumThreads; i++) {
+ threads.emplace_back(
+ [spin_time] { spin_ns(absl::ToInt64Nanoseconds(spin_time)); });
+ }
+ for (auto& t : threads) {
+ t.Join();
+ }
+
+ int64_t end = clock_gettime_nsecs(CLOCK_PROCESS_CPUTIME_ID);
+
+ // The aggregate time spent in the worker threads must be at least
+ // 'kNumThreads' times the time each thread spun.
+ ASSERT_GE(end - start, kNumThreads * absl::ToInt64Nanoseconds(spin_time));
+}
+
+TEST(ClockGettime, JavaThreadTime) {
+ clockid_t clockid;
+ ASSERT_EQ(0, pthread_getcpuclockid(pthread_self(), &clockid));
+ struct timespec tp;
+ ASSERT_THAT(clock_getres(clockid, &tp), SyscallSucceeds());
+ EXPECT_TRUE(tp.tv_sec > 0 || tp.tv_nsec > 0);
+ // A thread cputime is updated each 10msec and there is no approximation
+ // if a task is running.
+ do {
+ ASSERT_THAT(clock_gettime(clockid, &tp), SyscallSucceeds());
+ } while (tp.tv_sec == 0 && tp.tv_nsec == 0);
+ EXPECT_TRUE(tp.tv_sec > 0 || tp.tv_nsec > 0);
+}
+
+// There is not much to test here, since CLOCK_REALTIME may be discontiguous.
+TEST(ClockGettime, RealtimeWorks) {
+ struct timespec tp;
+ EXPECT_THAT(clock_gettime(CLOCK_REALTIME, &tp), SyscallSucceeds());
+}
+
+class MonotonicClockTest : public ::testing::TestWithParam<clockid_t> {};
+
+TEST_P(MonotonicClockTest, IsMonotonic) {
+ auto end = absl::Now() + absl::Seconds(5);
+
+ struct timespec tp;
+ EXPECT_THAT(clock_gettime(GetParam(), &tp), SyscallSucceeds());
+
+ auto prev = absl::TimeFromTimespec(tp);
+ while (absl::Now() < end) {
+ EXPECT_THAT(clock_gettime(GetParam(), &tp), SyscallSucceeds());
+ auto now = absl::TimeFromTimespec(tp);
+ EXPECT_GE(now, prev);
+ prev = now;
+ }
+}
+
+std::string PrintClockId(::testing::TestParamInfo<clockid_t> info) {
+ switch (info.param) {
+ case CLOCK_MONOTONIC:
+ return "CLOCK_MONOTONIC";
+ case CLOCK_MONOTONIC_COARSE:
+ return "CLOCK_MONOTONIC_COARSE";
+ case CLOCK_MONOTONIC_RAW:
+ return "CLOCK_MONOTONIC_RAW";
+ case CLOCK_BOOTTIME:
+ // CLOCK_BOOTTIME is a monotonic clock.
+ return "CLOCK_BOOTTIME";
+ default:
+ return absl::StrCat(info.param);
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(ClockGettime, MonotonicClockTest,
+ ::testing::Values(CLOCK_MONOTONIC,
+ CLOCK_MONOTONIC_COARSE,
+ CLOCK_MONOTONIC_RAW, CLOCK_BOOTTIME),
+ PrintClockId);
+
+TEST(ClockGettime, UnimplementedReturnsEINVAL) {
+ SKIP_IF(!IsRunningOnGvisor());
+
+ struct timespec tp;
+ EXPECT_THAT(clock_gettime(CLOCK_REALTIME_ALARM, &tp),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(clock_gettime(CLOCK_BOOTTIME_ALARM, &tp),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(ClockGettime, InvalidClockIDReturnsEINVAL) {
+ struct timespec tp;
+ EXPECT_THAT(clock_gettime(-1, &tp), SyscallFailsWithErrno(EINVAL));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/clock_nanosleep.cc b/test/syscalls/linux/clock_nanosleep.cc
new file mode 100644
index 000000000..b55cddc52
--- /dev/null
+++ b/test/syscalls/linux/clock_nanosleep.cc
@@ -0,0 +1,179 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <time.h>
+
+#include <atomic>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "absl/time/time.h"
+#include "test/util/cleanup.h"
+#include "test/util/posix_error.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/timer_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// sys_clock_nanosleep is defined because the glibc clock_nanosleep returns
+// error numbers directly and does not set errno. This makes our Syscall
+// matchers look a little weird when expecting failure:
+// "SyscallSucceedsWithValue(ERRNO)".
+int sys_clock_nanosleep(clockid_t clkid, int flags,
+ const struct timespec* request,
+ struct timespec* remain) {
+ return syscall(SYS_clock_nanosleep, clkid, flags, request, remain);
+}
+
+PosixErrorOr<absl::Time> GetTime(clockid_t clk) {
+ struct timespec ts = {};
+ const int rc = clock_gettime(clk, &ts);
+ MaybeSave();
+ if (rc < 0) {
+ return PosixError(errno, "clock_gettime");
+ }
+ return absl::TimeFromTimespec(ts);
+}
+
+class WallClockNanosleepTest : public ::testing::TestWithParam<clockid_t> {};
+
+TEST_P(WallClockNanosleepTest, InvalidValues) {
+ const struct timespec invalid[] = {
+ {.tv_sec = -1, .tv_nsec = -1}, {.tv_sec = 0, .tv_nsec = INT32_MIN},
+ {.tv_sec = 0, .tv_nsec = INT32_MAX}, {.tv_sec = 0, .tv_nsec = -1},
+ {.tv_sec = -1, .tv_nsec = 0},
+ };
+
+ for (auto const ts : invalid) {
+ EXPECT_THAT(sys_clock_nanosleep(GetParam(), 0, &ts, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+ }
+}
+
+TEST_P(WallClockNanosleepTest, SleepOneSecond) {
+ constexpr absl::Duration kSleepDuration = absl::Seconds(1);
+ struct timespec duration = absl::ToTimespec(kSleepDuration);
+
+ const absl::Time before = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
+ EXPECT_THAT(
+ RetryEINTR(sys_clock_nanosleep)(GetParam(), 0, &duration, &duration),
+ SyscallSucceeds());
+ const absl::Time after = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
+
+ EXPECT_GE(after - before, kSleepDuration);
+}
+
+TEST_P(WallClockNanosleepTest, InterruptedNanosleep) {
+ constexpr absl::Duration kSleepDuration = absl::Seconds(60);
+ struct timespec duration = absl::ToTimespec(kSleepDuration);
+
+ // Install no-op signal handler for SIGALRM.
+ struct sigaction sa = {};
+ sigfillset(&sa.sa_mask);
+ sa.sa_handler = +[](int signo) {};
+ const auto cleanup_sa =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa));
+
+ // Measure time since setting the alarm, since the alarm will interrupt the
+ // sleep and hence determine how long we sleep.
+ const absl::Time before = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
+
+ // Set an alarm to go off while sleeping.
+ struct itimerval timer = {};
+ timer.it_value.tv_sec = 1;
+ timer.it_value.tv_usec = 0;
+ timer.it_interval.tv_sec = 1;
+ timer.it_interval.tv_usec = 0;
+ const auto cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_REAL, timer));
+
+ EXPECT_THAT(sys_clock_nanosleep(GetParam(), 0, &duration, &duration),
+ SyscallFailsWithErrno(EINTR));
+ const absl::Time after = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
+
+ // Remaining time updated.
+ const absl::Duration remaining = absl::DurationFromTimespec(duration);
+ EXPECT_GE(after - before + remaining, kSleepDuration);
+}
+
+// Remaining time is *not* updated if nanosleep completes uninterrupted.
+TEST_P(WallClockNanosleepTest, UninterruptedNanosleep) {
+ constexpr absl::Duration kSleepDuration = absl::Milliseconds(10);
+ const struct timespec duration = absl::ToTimespec(kSleepDuration);
+
+ while (true) {
+ constexpr int kRemainingMagic = 42;
+ struct timespec remaining;
+ remaining.tv_sec = kRemainingMagic;
+ remaining.tv_nsec = kRemainingMagic;
+
+ int ret = sys_clock_nanosleep(GetParam(), 0, &duration, &remaining);
+ if (ret == EINTR) {
+ // Retry from beginning. We want a single uninterrupted call.
+ continue;
+ }
+
+ EXPECT_THAT(ret, SyscallSucceeds());
+ EXPECT_EQ(remaining.tv_sec, kRemainingMagic);
+ EXPECT_EQ(remaining.tv_nsec, kRemainingMagic);
+ break;
+ }
+}
+
+TEST_P(WallClockNanosleepTest, SleepUntil) {
+ const absl::Time now = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
+ const absl::Time until = now + absl::Seconds(2);
+ const struct timespec ts = absl::ToTimespec(until);
+
+ EXPECT_THAT(
+ RetryEINTR(sys_clock_nanosleep)(GetParam(), TIMER_ABSTIME, &ts, nullptr),
+ SyscallSucceeds());
+ const absl::Time after = ASSERT_NO_ERRNO_AND_VALUE(GetTime(GetParam()));
+
+ EXPECT_GE(after, until);
+}
+
+INSTANTIATE_TEST_SUITE_P(Sleepers, WallClockNanosleepTest,
+ ::testing::Values(CLOCK_REALTIME, CLOCK_MONOTONIC));
+
+TEST(ClockNanosleepProcessTest, SleepFiveSeconds) {
+ const absl::Duration kSleepDuration = absl::Seconds(5);
+ struct timespec duration = absl::ToTimespec(kSleepDuration);
+
+ // Ensure that CLOCK_PROCESS_CPUTIME_ID advances.
+ std::atomic<bool> done(false);
+ ScopedThread t([&] {
+ while (!done.load()) {
+ }
+ });
+ const auto cleanup_done = Cleanup([&] { done.store(true); });
+
+ const absl::Time before =
+ ASSERT_NO_ERRNO_AND_VALUE(GetTime(CLOCK_PROCESS_CPUTIME_ID));
+ EXPECT_THAT(RetryEINTR(sys_clock_nanosleep)(CLOCK_PROCESS_CPUTIME_ID, 0,
+ &duration, &duration),
+ SyscallSucceeds());
+ const absl::Time after =
+ ASSERT_NO_ERRNO_AND_VALUE(GetTime(CLOCK_PROCESS_CPUTIME_ID));
+ EXPECT_GE(after - before, kSleepDuration);
+}
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/concurrency.cc b/test/syscalls/linux/concurrency.cc
new file mode 100644
index 000000000..7cd6a75bd
--- /dev/null
+++ b/test/syscalls/linux/concurrency.cc
@@ -0,0 +1,127 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+
+#include <atomic>
+
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/platform_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+// Test that a thread that never yields to the OS does not prevent other threads
+// from running.
+TEST(ConcurrencyTest, SingleProcessMultithreaded) {
+ std::atomic<int> a(0);
+
+ ScopedThread t([&a]() {
+ while (!a.load()) {
+ }
+ });
+
+ absl::SleepFor(absl::Seconds(1));
+
+ // We are still able to execute code in this thread. The other hasn't
+ // permanently hung execution in both threads.
+ a.store(1);
+}
+
+// Test that multiple threads in this process continue to execute in parallel,
+// even if an unrelated second process is spawned. Regression test for
+// b/32119508.
+TEST(ConcurrencyTest, MultiProcessMultithreaded) {
+ // In PID 1, start TIDs 1 and 2, and put both to sleep.
+ //
+ // Start PID 3, which spins for 5 seconds, then exits.
+ //
+ // TIDs 1 and 2 wake and attempt to Activate, which cannot occur until PID 3
+ // exits.
+ //
+ // Both TIDs 1 and 2 should be woken. If they are not both woken, the test
+ // hangs.
+ //
+ // This is all fundamentally racy. If we are failing to wake all threads, the
+ // expectation is that this test becomes flaky, rather than consistently
+ // failing.
+ //
+ // If additional background threads fail to block, we may never schedule the
+ // child, at which point this test effectively becomes
+ // MultiProcessConcurrency. That's not expected to occur.
+
+ std::atomic<int> a(0);
+ ScopedThread t([&a]() {
+ // Block so that PID 3 can execute and we can wait on its exit.
+ absl::SleepFor(absl::Seconds(1));
+ while (!a.load()) {
+ }
+ });
+
+ pid_t child_pid = fork();
+ if (child_pid == 0) {
+ // Busy wait without making any blocking syscalls.
+ auto end = absl::Now() + absl::Seconds(5);
+ while (absl::Now() < end) {
+ }
+ _exit(0);
+ }
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ absl::SleepFor(absl::Seconds(1));
+
+ // If only TID 1 is woken, thread.Join will hang.
+ // If only TID 2 is woken, both will hang.
+ a.store(1);
+ t.Join();
+
+ int status = 0;
+ EXPECT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status));
+ EXPECT_EQ(WEXITSTATUS(status), 0);
+}
+
+// Test that multiple processes can execute concurrently, even if one process
+// never yields.
+TEST(ConcurrencyTest, MultiProcessConcurrency) {
+ SKIP_IF(PlatformSupportMultiProcess() == PlatformSupport::NotSupported);
+
+ pid_t child_pid = fork();
+ if (child_pid == 0) {
+ while (true) {
+ }
+ }
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ absl::SleepFor(absl::Seconds(5));
+
+ // We are still able to execute code in this process. The other hasn't
+ // permanently hung execution in both processes.
+ ASSERT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds());
+ int status = 0;
+
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(WTERMSIG(status), SIGKILL);
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/connect_external.cc b/test/syscalls/linux/connect_external.cc
new file mode 100644
index 000000000..1edb50e47
--- /dev/null
+++ b/test/syscalls/linux/connect_external.cc
@@ -0,0 +1,163 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <string>
+#include <tuple>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+
+// This file contains tests specific to connecting to host UDS managed outside
+// the sandbox / test.
+//
+// A set of ultity sockets will be created externally in $TEST_UDS_TREE and
+// $TEST_UDS_ATTACH_TREE for these tests to interact with.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+struct ProtocolSocket {
+ int protocol;
+ std::string name;
+};
+
+// Parameter is (socket root dir, ProtocolSocket).
+using GoferStreamSeqpacketTest =
+ ::testing::TestWithParam<std::tuple<std::string, ProtocolSocket>>;
+
+// Connect to a socket and verify that write/read work.
+//
+// An "echo" socket doesn't work for dgram sockets because our socket is
+// unnamed. The server thus has no way to reply to us.
+TEST_P(GoferStreamSeqpacketTest, Echo) {
+ std::string env;
+ ProtocolSocket proto;
+ std::tie(env, proto) = GetParam();
+
+ char* val = getenv(env.c_str());
+ ASSERT_NE(val, nullptr);
+ std::string root(val);
+
+ FileDescriptor sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, proto.protocol, 0));
+
+ std::string socket_path = JoinPath(root, proto.name, "echo");
+
+ struct sockaddr_un addr = {};
+ addr.sun_family = AF_UNIX;
+ memcpy(addr.sun_path, socket_path.c_str(), socket_path.length());
+
+ ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallSucceeds());
+
+ constexpr int kBufferSize = 64;
+ char send_buffer[kBufferSize];
+ memset(send_buffer, 'a', sizeof(send_buffer));
+
+ ASSERT_THAT(WriteFd(sock.get(), send_buffer, sizeof(send_buffer)),
+ SyscallSucceedsWithValue(sizeof(send_buffer)));
+
+ char recv_buffer[kBufferSize];
+ ASSERT_THAT(ReadFd(sock.get(), recv_buffer, sizeof(recv_buffer)),
+ SyscallSucceedsWithValue(sizeof(recv_buffer)));
+ ASSERT_EQ(0, memcmp(send_buffer, recv_buffer, sizeof(send_buffer)));
+}
+
+// It is not possible to connect to a bound but non-listening socket.
+TEST_P(GoferStreamSeqpacketTest, NonListening) {
+ std::string env;
+ ProtocolSocket proto;
+ std::tie(env, proto) = GetParam();
+
+ char* val = getenv(env.c_str());
+ ASSERT_NE(val, nullptr);
+ std::string root(val);
+
+ FileDescriptor sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, proto.protocol, 0));
+
+ std::string socket_path = JoinPath(root, proto.name, "nonlistening");
+
+ struct sockaddr_un addr = {};
+ addr.sun_family = AF_UNIX;
+ memcpy(addr.sun_path, socket_path.c_str(), socket_path.length());
+
+ ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallFailsWithErrno(ECONNREFUSED));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ StreamSeqpacket, GoferStreamSeqpacketTest,
+ ::testing::Combine(
+ // Test access via standard path and attach point.
+ ::testing::Values("TEST_UDS_TREE", "TEST_UDS_ATTACH_TREE"),
+ ::testing::Values(ProtocolSocket{SOCK_STREAM, "stream"},
+ ProtocolSocket{SOCK_SEQPACKET, "seqpacket"})));
+
+// Parameter is socket root dir.
+using GoferDgramTest = ::testing::TestWithParam<std::string>;
+
+// Connect to a socket and verify that write works.
+//
+// An "echo" socket doesn't work for dgram sockets because our socket is
+// unnamed. The server thus has no way to reply to us.
+TEST_P(GoferDgramTest, Null) {
+ std::string env = GetParam();
+ char* val = getenv(env.c_str());
+ ASSERT_NE(val, nullptr);
+ std::string root(val);
+
+ FileDescriptor sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_DGRAM, 0));
+
+ std::string socket_path = JoinPath(root, "dgram/null");
+
+ struct sockaddr_un addr = {};
+ addr.sun_family = AF_UNIX;
+ memcpy(addr.sun_path, socket_path.c_str(), socket_path.length());
+
+ ASSERT_THAT(connect(sock.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallSucceeds());
+
+ constexpr int kBufferSize = 64;
+ char send_buffer[kBufferSize];
+ memset(send_buffer, 'a', sizeof(send_buffer));
+
+ ASSERT_THAT(WriteFd(sock.get(), send_buffer, sizeof(send_buffer)),
+ SyscallSucceedsWithValue(sizeof(send_buffer)));
+}
+
+INSTANTIATE_TEST_SUITE_P(Dgram, GoferDgramTest,
+ // Test access via standard path and attach point.
+ ::testing::Values("TEST_UDS_TREE",
+ "TEST_UDS_ATTACH_TREE"));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/creat.cc b/test/syscalls/linux/creat.cc
new file mode 100644
index 000000000..3c270d6da
--- /dev/null
+++ b/test/syscalls/linux/creat.cc
@@ -0,0 +1,68 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kMode = 0666;
+
+TEST(CreatTest, CreatCreatesNewFile) {
+ std::string const path = NewTempAbsPath();
+ struct stat buf;
+ int fd;
+ ASSERT_THAT(stat(path.c_str(), &buf), SyscallFailsWithErrno(ENOENT));
+ ASSERT_THAT(fd = creat(path.c_str(), kMode), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ EXPECT_THAT(stat(path.c_str(), &buf), SyscallSucceeds());
+}
+
+TEST(CreatTest, CreatTruncatesExistingFile) {
+ auto temp_path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ int fd;
+ ASSERT_NO_ERRNO(SetContents(temp_path.path(), "non-empty"));
+ ASSERT_THAT(fd = creat(temp_path.path().c_str(), kMode), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ std::string new_contents;
+ ASSERT_NO_ERRNO(GetContents(temp_path.path(), &new_contents));
+ EXPECT_EQ("", new_contents);
+}
+
+TEST(CreatTest, CreatWithNameTooLong) {
+ // Start with a unique name, and pad it to NAME_MAX + 1;
+ std::string name = NewTempRelPath();
+ int padding = (NAME_MAX + 1) - name.size();
+ name.append(padding, 'x');
+ const std::string& path = JoinPath(GetAbsoluteTestTmpdir(), name);
+
+ // Creation should return ENAMETOOLONG.
+ ASSERT_THAT(creat(path.c_str(), kMode), SyscallFailsWithErrno(ENAMETOOLONG));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc
new file mode 100644
index 000000000..3c88c4cbd
--- /dev/null
+++ b/test/syscalls/linux/dev.cc
@@ -0,0 +1,167 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(DevTest, LseekDevUrandom) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/urandom", O_RDONLY));
+ EXPECT_THAT(lseek(fd.get(), -10, SEEK_CUR), SyscallSucceeds());
+ EXPECT_THAT(lseek(fd.get(), -10, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds());
+}
+
+TEST(DevTest, LseekDevNull) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDONLY));
+ EXPECT_THAT(lseek(fd.get(), -10, SEEK_CUR), SyscallSucceeds());
+ EXPECT_THAT(lseek(fd.get(), -10, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds());
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_END), SyscallSucceeds());
+}
+
+TEST(DevTest, LseekDevZero) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDONLY));
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds());
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_END), SyscallSucceeds());
+}
+
+TEST(DevTest, LseekDevFull) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/full", O_RDONLY));
+ EXPECT_THAT(lseek(fd.get(), 123, SEEK_SET), SyscallSucceedsWithValue(0));
+ EXPECT_THAT(lseek(fd.get(), 123, SEEK_CUR), SyscallSucceedsWithValue(0));
+ EXPECT_THAT(lseek(fd.get(), 123, SEEK_END), SyscallSucceedsWithValue(0));
+}
+
+TEST(DevTest, LseekDevNullFreshFile) {
+ // Seeks to /dev/null always return 0.
+ const FileDescriptor fd1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDONLY));
+ const FileDescriptor fd2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDONLY));
+
+ EXPECT_THAT(lseek(fd1.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+ EXPECT_THAT(lseek(fd1.get(), 1000, SEEK_CUR), SyscallSucceedsWithValue(0));
+ EXPECT_THAT(lseek(fd2.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+
+ const FileDescriptor fd3 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDONLY));
+ EXPECT_THAT(lseek(fd3.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+}
+
+TEST(DevTest, OpenTruncate) {
+ // Truncation is ignored on linux and gvisor for device files.
+ ASSERT_NO_ERRNO_AND_VALUE(
+ Open("/dev/null", O_CREAT | O_TRUNC | O_WRONLY, 0644));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ Open("/dev/zero", O_CREAT | O_TRUNC | O_WRONLY, 0644));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ Open("/dev/full", O_CREAT | O_TRUNC | O_WRONLY, 0644));
+}
+
+TEST(DevTest, Pread64DevNull) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDONLY));
+ char buf[1];
+ EXPECT_THAT(pread64(fd.get(), buf, 1, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST(DevTest, Pread64DevZero) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDONLY));
+ char buf[1];
+ EXPECT_THAT(pread64(fd.get(), buf, 1, 0), SyscallSucceedsWithValue(1));
+}
+
+TEST(DevTest, Pread64DevFull) {
+ // /dev/full behaves like /dev/zero with respect to reads.
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/full", O_RDONLY));
+ char buf[1];
+ EXPECT_THAT(pread64(fd.get(), buf, 1, 0), SyscallSucceedsWithValue(1));
+}
+
+TEST(DevTest, ReadDevNull) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDONLY));
+ std::vector<char> buf(1);
+ EXPECT_THAT(ReadFd(fd.get(), buf.data(), 1), SyscallSucceeds());
+}
+
+// Do not allow random save as it could lead to partial reads.
+TEST(DevTest, ReadDevZero_NoRandomSave) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDONLY));
+
+ constexpr int kReadSize = 128 * 1024;
+ std::vector<char> buf(kReadSize, 1);
+ EXPECT_THAT(ReadFd(fd.get(), buf.data(), kReadSize),
+ SyscallSucceedsWithValue(kReadSize));
+ EXPECT_EQ(std::vector<char>(kReadSize, 0), buf);
+}
+
+TEST(DevTest, WriteDevNull) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_WRONLY));
+ EXPECT_THAT(WriteFd(fd.get(), "a", 1), SyscallSucceedsWithValue(1));
+}
+
+TEST(DevTest, WriteDevZero) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_WRONLY));
+ EXPECT_THAT(WriteFd(fd.get(), "a", 1), SyscallSucceedsWithValue(1));
+}
+
+TEST(DevTest, WriteDevFull) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/full", O_WRONLY));
+ EXPECT_THAT(WriteFd(fd.get(), "a", 1), SyscallFailsWithErrno(ENOSPC));
+}
+
+TEST(DevTest, TTYExists) {
+ struct stat statbuf = {};
+ ASSERT_THAT(stat("/dev/tty", &statbuf), SyscallSucceeds());
+ // Check that it's a character device with rw-rw-rw- permissions.
+ EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
+}
+
+TEST(DevTest, OpenDevFuse) {
+ // Note(gvisor.dev/issue/3076) This won't work in the sentry until the new
+ // device registration is complete.
+ SKIP_IF(IsRunningWithVFS1() || IsRunningOnGvisor());
+
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/fuse", O_RDONLY));
+}
+
+} // namespace
+} // namespace testing
+
+} // namespace gvisor
diff --git a/test/syscalls/linux/dup.cc b/test/syscalls/linux/dup.cc
new file mode 100644
index 000000000..4f773bc75
--- /dev/null
+++ b/test/syscalls/linux/dup.cc
@@ -0,0 +1,133 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/eventfd_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+PosixErrorOr<FileDescriptor> Dup2(const FileDescriptor& fd, int target_fd) {
+ int new_fd = dup2(fd.get(), target_fd);
+ if (new_fd < 0) {
+ return PosixError(errno, "Dup2");
+ }
+ return FileDescriptor(new_fd);
+}
+
+PosixErrorOr<FileDescriptor> Dup3(const FileDescriptor& fd, int target_fd,
+ int flags) {
+ int new_fd = dup3(fd.get(), target_fd, flags);
+ if (new_fd < 0) {
+ return PosixError(errno, "Dup2");
+ }
+ return FileDescriptor(new_fd);
+}
+
+void CheckSameFile(const FileDescriptor& fd1, const FileDescriptor& fd2) {
+ struct stat stat_result1, stat_result2;
+ ASSERT_THAT(fstat(fd1.get(), &stat_result1), SyscallSucceeds());
+ ASSERT_THAT(fstat(fd2.get(), &stat_result2), SyscallSucceeds());
+ EXPECT_EQ(stat_result1.st_dev, stat_result2.st_dev);
+ EXPECT_EQ(stat_result1.st_ino, stat_result2.st_ino);
+}
+
+TEST(DupTest, Dup) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY));
+
+ // Dup the descriptor and make sure it's the same file.
+ FileDescriptor nfd = ASSERT_NO_ERRNO_AND_VALUE(fd.Dup());
+ ASSERT_NE(fd.get(), nfd.get());
+ CheckSameFile(fd, nfd);
+}
+
+TEST(DupTest, DupClearsCloExec) {
+ // Open an eventfd file descriptor with FD_CLOEXEC descriptor flag set.
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_CLOEXEC));
+ EXPECT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
+
+ // Duplicate the descriptor. Ensure that it doesn't have FD_CLOEXEC set.
+ FileDescriptor nfd = ASSERT_NO_ERRNO_AND_VALUE(fd.Dup());
+ ASSERT_NE(fd.get(), nfd.get());
+ CheckSameFile(fd, nfd);
+ EXPECT_THAT(fcntl(nfd.get(), F_GETFD), SyscallSucceedsWithValue(0));
+}
+
+TEST(DupTest, Dup2) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY));
+
+ // Regular dup once.
+ FileDescriptor nfd = ASSERT_NO_ERRNO_AND_VALUE(fd.Dup());
+
+ ASSERT_NE(fd.get(), nfd.get());
+ CheckSameFile(fd, nfd);
+
+ // Dup over the file above.
+ int target_fd = nfd.release();
+ FileDescriptor nfd2 = ASSERT_NO_ERRNO_AND_VALUE(Dup2(fd, target_fd));
+ EXPECT_EQ(target_fd, nfd2.get());
+ CheckSameFile(fd, nfd2);
+}
+
+TEST(DupTest, Dup2SameFD) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY));
+
+ // Should succeed.
+ ASSERT_THAT(dup2(fd.get(), fd.get()), SyscallSucceedsWithValue(fd.get()));
+}
+
+TEST(DupTest, Dup3) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY));
+
+ // Regular dup once.
+ FileDescriptor nfd = ASSERT_NO_ERRNO_AND_VALUE(fd.Dup());
+ ASSERT_NE(fd.get(), nfd.get());
+ CheckSameFile(fd, nfd);
+
+ // Dup over the file above, check that it has no CLOEXEC.
+ nfd = ASSERT_NO_ERRNO_AND_VALUE(Dup3(fd, nfd.release(), 0));
+ CheckSameFile(fd, nfd);
+ EXPECT_THAT(fcntl(nfd.get(), F_GETFD), SyscallSucceedsWithValue(0));
+
+ // Dup over the file again, check that it does not CLOEXEC.
+ nfd = ASSERT_NO_ERRNO_AND_VALUE(Dup3(fd, nfd.release(), O_CLOEXEC));
+ CheckSameFile(fd, nfd);
+ EXPECT_THAT(fcntl(nfd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
+}
+
+TEST(DupTest, Dup3FailsSameFD) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY));
+
+ // Only dup3 fails if the new and old fd are the same.
+ ASSERT_THAT(dup3(fd.get(), fd.get(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc
new file mode 100644
index 000000000..f57d38dc7
--- /dev/null
+++ b/test/syscalls/linux/epoll.cc
@@ -0,0 +1,428 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/epoll_util.h"
+#include "test/util/eventfd_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kFDsPerEpoll = 3;
+constexpr uint64_t kMagicConstant = 0x0102030405060708;
+
+uint64_t ms_elapsed(const struct timespec* begin, const struct timespec* end) {
+ return (end->tv_sec - begin->tv_sec) * 1000 +
+ (end->tv_nsec - begin->tv_nsec) / 1000000;
+}
+
+TEST(EpollTest, AllWritable) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ std::vector<FileDescriptor> eventfds;
+ for (int i = 0; i < kFDsPerEpoll; i++) {
+ eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+ ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(),
+ EPOLLIN | EPOLLOUT, kMagicConstant + i));
+ }
+
+ struct epoll_event result[kFDsPerEpoll];
+ ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
+ SyscallSucceedsWithValue(kFDsPerEpoll));
+ for (int i = 0; i < kFDsPerEpoll; i++) {
+ ASSERT_EQ(result[i].events, EPOLLOUT);
+ }
+}
+
+TEST(EpollTest, LastReadable) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ std::vector<FileDescriptor> eventfds;
+ for (int i = 0; i < kFDsPerEpoll; i++) {
+ eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+ ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(),
+ EPOLLIN | EPOLLOUT, kMagicConstant + i));
+ }
+
+ uint64_t tmp = 1;
+ ASSERT_THAT(WriteFd(eventfds[kFDsPerEpoll - 1].get(), &tmp, sizeof(tmp)),
+ SyscallSucceedsWithValue(sizeof(tmp)));
+
+ struct epoll_event result[kFDsPerEpoll];
+ ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
+ SyscallSucceedsWithValue(kFDsPerEpoll));
+
+ int i;
+ for (i = 0; i < kFDsPerEpoll - 1; i++) {
+ EXPECT_EQ(result[i].events, EPOLLOUT);
+ }
+ EXPECT_EQ(result[i].events, EPOLLOUT | EPOLLIN);
+ EXPECT_EQ(result[i].data.u64, kMagicConstant + i);
+}
+
+TEST(EpollTest, LastNonWritable) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ std::vector<FileDescriptor> eventfds;
+ for (int i = 0; i < kFDsPerEpoll; i++) {
+ eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+ ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(),
+ EPOLLIN | EPOLLOUT, kMagicConstant + i));
+ }
+
+ // Write the maximum value to the event fd so that writing to it again would
+ // block.
+ uint64_t tmp = ULLONG_MAX - 1;
+ ASSERT_THAT(WriteFd(eventfds[kFDsPerEpoll - 1].get(), &tmp, sizeof(tmp)),
+ SyscallSucceedsWithValue(sizeof(tmp)));
+
+ struct epoll_event result[kFDsPerEpoll];
+ ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
+ SyscallSucceedsWithValue(kFDsPerEpoll));
+
+ int i;
+ for (i = 0; i < kFDsPerEpoll - 1; i++) {
+ EXPECT_EQ(result[i].events, EPOLLOUT);
+ }
+ EXPECT_EQ(result[i].events, EPOLLIN);
+ EXPECT_THAT(ReadFd(eventfds[kFDsPerEpoll - 1].get(), &tmp, sizeof(tmp)),
+ sizeof(tmp));
+ EXPECT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
+ SyscallSucceedsWithValue(kFDsPerEpoll));
+
+ for (i = 0; i < kFDsPerEpoll; i++) {
+ EXPECT_EQ(result[i].events, EPOLLOUT);
+ }
+}
+
+TEST(EpollTest, Timeout_NoRandomSave) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ std::vector<FileDescriptor> eventfds;
+ for (int i = 0; i < kFDsPerEpoll; i++) {
+ eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+ ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN,
+ kMagicConstant + i));
+ }
+
+ constexpr int kTimeoutMs = 200;
+ struct timespec begin;
+ struct timespec end;
+ struct epoll_event result[kFDsPerEpoll];
+
+ {
+ const DisableSave ds; // Timing-related.
+ EXPECT_THAT(clock_gettime(CLOCK_MONOTONIC, &begin), SyscallSucceeds());
+
+ ASSERT_THAT(
+ RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, kTimeoutMs),
+ SyscallSucceedsWithValue(0));
+ EXPECT_THAT(clock_gettime(CLOCK_MONOTONIC, &end), SyscallSucceeds());
+ }
+
+ // Check the lower bound on the timeout. Checking for an upper bound is
+ // fragile because Linux can overrun the timeout due to scheduling delays.
+ EXPECT_GT(ms_elapsed(&begin, &end), kTimeoutMs - 1);
+}
+
+void* writer(void* arg) {
+ int fd = *reinterpret_cast<int*>(arg);
+ uint64_t tmp = 1;
+
+ usleep(200000);
+ if (WriteFd(fd, &tmp, sizeof(tmp)) != sizeof(tmp)) {
+ fprintf(stderr, "writer failed: errno %s\n", strerror(errno));
+ }
+
+ return nullptr;
+}
+
+TEST(EpollTest, WaitThenUnblock) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ std::vector<FileDescriptor> eventfds;
+ for (int i = 0; i < kFDsPerEpoll; i++) {
+ eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+ ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN,
+ kMagicConstant + i));
+ }
+
+ // Fire off a thread that will make at least one of the event fds readable.
+ pthread_t thread;
+ int make_readable = eventfds[0].get();
+ ASSERT_THAT(pthread_create(&thread, nullptr, writer, &make_readable),
+ SyscallSucceedsWithValue(0));
+
+ struct epoll_event result[kFDsPerEpoll];
+ EXPECT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
+ SyscallSucceedsWithValue(1));
+ EXPECT_THAT(pthread_detach(thread), SyscallSucceeds());
+}
+
+void sighandler(int s) {}
+
+void* signaler(void* arg) {
+ pthread_t* t = reinterpret_cast<pthread_t*>(arg);
+ // Repeatedly send the real-time signal until we are detached, because it's
+ // difficult to know exactly when epoll_wait on another thread (which this
+ // is intending to interrupt) has started blocking.
+ while (1) {
+ usleep(200000);
+ pthread_kill(*t, SIGRTMIN);
+ }
+ return nullptr;
+}
+
+TEST(EpollTest, UnblockWithSignal) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ std::vector<FileDescriptor> eventfds;
+ for (int i = 0; i < kFDsPerEpoll; i++) {
+ eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+ ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN,
+ kMagicConstant + i));
+ }
+
+ signal(SIGRTMIN, sighandler);
+ // Unblock the real time signals that InitGoogle blocks :(
+ sigset_t unblock;
+ sigemptyset(&unblock);
+ sigaddset(&unblock, SIGRTMIN);
+ ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &unblock, nullptr), SyscallSucceeds());
+
+ pthread_t thread;
+ pthread_t cur = pthread_self();
+ ASSERT_THAT(pthread_create(&thread, nullptr, signaler, &cur),
+ SyscallSucceedsWithValue(0));
+
+ struct epoll_event result[kFDsPerEpoll];
+ EXPECT_THAT(epoll_wait(epollfd.get(), result, kFDsPerEpoll, -1),
+ SyscallFailsWithErrno(EINTR));
+ EXPECT_THAT(pthread_cancel(thread), SyscallSucceeds());
+ EXPECT_THAT(pthread_detach(thread), SyscallSucceeds());
+}
+
+TEST(EpollTest, TimeoutNoFds) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ struct epoll_event result[kFDsPerEpoll];
+ EXPECT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, 100),
+ SyscallSucceedsWithValue(0));
+}
+
+struct addr_ctx {
+ int epollfd;
+ int eventfd;
+};
+
+void* fd_adder(void* arg) {
+ struct addr_ctx* actx = reinterpret_cast<struct addr_ctx*>(arg);
+ struct epoll_event event;
+ event.events = EPOLLIN | EPOLLOUT;
+ event.data.u64 = 0xdeadbeeffacefeed;
+
+ usleep(200000);
+ if (epoll_ctl(actx->epollfd, EPOLL_CTL_ADD, actx->eventfd, &event) == -1) {
+ fprintf(stderr, "epoll_ctl failed: %s\n", strerror(errno));
+ }
+
+ return nullptr;
+}
+
+TEST(EpollTest, UnblockWithNewFD) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ auto eventfd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD());
+
+ pthread_t thread;
+ struct addr_ctx actx = {epollfd.get(), eventfd.get()};
+ ASSERT_THAT(pthread_create(&thread, nullptr, fd_adder, &actx),
+ SyscallSucceedsWithValue(0));
+
+ struct epoll_event result[kFDsPerEpoll];
+ // Wait while no FDs are ready, but after 200ms fd_adder will add a ready FD
+ // to epoll which will wake us up.
+ EXPECT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
+ SyscallSucceedsWithValue(1));
+ EXPECT_THAT(pthread_detach(thread), SyscallSucceeds());
+ EXPECT_EQ(result[0].data.u64, 0xdeadbeeffacefeed);
+}
+
+TEST(EpollTest, Oneshot) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ std::vector<FileDescriptor> eventfds;
+ for (int i = 0; i < kFDsPerEpoll; i++) {
+ eventfds.push_back(ASSERT_NO_ERRNO_AND_VALUE(NewEventFD()));
+ ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfds[i].get(), EPOLLIN,
+ kMagicConstant + i));
+ }
+
+ struct epoll_event event;
+ event.events = EPOLLOUT | EPOLLONESHOT;
+ event.data.u64 = kMagicConstant;
+ ASSERT_THAT(
+ epoll_ctl(epollfd.get(), EPOLL_CTL_MOD, eventfds[0].get(), &event),
+ SyscallSucceeds());
+
+ struct epoll_event result[kFDsPerEpoll];
+ // One-shot entry means that the first epoll_wait should succeed.
+ ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ(result[0].data.u64, kMagicConstant);
+
+ // One-shot entry means that the second epoll_wait should timeout.
+ EXPECT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, 100),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST(EpollTest, EdgeTriggered_NoRandomSave) {
+ // Test edge-triggered entry: make it edge-triggered, first wait should
+ // return it, second one should time out, make it writable again, third wait
+ // should return it, fourth wait should timeout.
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ auto eventfd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD());
+ ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfd.get(),
+ EPOLLOUT | EPOLLET, kMagicConstant));
+
+ struct epoll_event result[kFDsPerEpoll];
+
+ {
+ const DisableSave ds; // May trigger spurious event.
+
+ // Edge-triggered entry means that the first epoll_wait should return the
+ // event.
+ ASSERT_THAT(epoll_wait(epollfd.get(), result, kFDsPerEpoll, -1),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ(result[0].data.u64, kMagicConstant);
+
+ // Edge-triggered entry means that the second epoll_wait should time out.
+ ASSERT_THAT(epoll_wait(epollfd.get(), result, kFDsPerEpoll, 100),
+ SyscallSucceedsWithValue(0));
+ }
+
+ uint64_t tmp = ULLONG_MAX - 1;
+
+ // Make an fd non-writable.
+ ASSERT_THAT(WriteFd(eventfd.get(), &tmp, sizeof(tmp)),
+ SyscallSucceedsWithValue(sizeof(tmp)));
+
+ // Make the same fd non-writable to trigger a change, which will trigger an
+ // edge-triggered event.
+ ASSERT_THAT(ReadFd(eventfd.get(), &tmp, sizeof(tmp)),
+ SyscallSucceedsWithValue(sizeof(tmp)));
+
+ {
+ const DisableSave ds; // May trigger spurious event.
+
+ // An edge-triggered event should now be returned.
+ ASSERT_THAT(epoll_wait(epollfd.get(), result, kFDsPerEpoll, -1),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ(result[0].data.u64, kMagicConstant);
+
+ // The edge-triggered event had been consumed above, we don't expect to
+ // get it again.
+ ASSERT_THAT(epoll_wait(epollfd.get(), result, kFDsPerEpoll, 100),
+ SyscallSucceedsWithValue(0));
+ }
+}
+
+TEST(EpollTest, OneshotAndEdgeTriggered) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ auto eventfd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD());
+ ASSERT_NO_ERRNO(RegisterEpollFD(epollfd.get(), eventfd.get(),
+ EPOLLOUT | EPOLLET | EPOLLONESHOT,
+ kMagicConstant));
+
+ struct epoll_event result[kFDsPerEpoll];
+ // First time one shot edge-triggered entry means that epoll_wait should
+ // return the event.
+ ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ(result[0].data.u64, kMagicConstant);
+
+ // Edge-triggered entry means that the second epoll_wait should time out.
+ ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, 100),
+ SyscallSucceedsWithValue(0));
+
+ uint64_t tmp = ULLONG_MAX - 1;
+ // Make an fd non-writable.
+ ASSERT_THAT(WriteFd(eventfd.get(), &tmp, sizeof(tmp)),
+ SyscallSucceedsWithValue(sizeof(tmp)));
+ // Make the same fd non-writable to trigger a change, which will not trigger
+ // an edge-triggered event because we've also included EPOLLONESHOT.
+ ASSERT_THAT(ReadFd(eventfd.get(), &tmp, sizeof(tmp)),
+ SyscallSucceedsWithValue(sizeof(tmp)));
+ ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, 100),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST(EpollTest, CycleOfOneDisallowed) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+
+ struct epoll_event event;
+ event.events = EPOLLOUT;
+ event.data.u64 = kMagicConstant;
+
+ ASSERT_THAT(epoll_ctl(epollfd.get(), EPOLL_CTL_ADD, epollfd.get(), &event),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(EpollTest, CycleOfThreeDisallowed) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ auto epollfd1 = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ auto epollfd2 = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+
+ ASSERT_NO_ERRNO(
+ RegisterEpollFD(epollfd.get(), epollfd1.get(), EPOLLIN, kMagicConstant));
+ ASSERT_NO_ERRNO(
+ RegisterEpollFD(epollfd1.get(), epollfd2.get(), EPOLLIN, kMagicConstant));
+
+ struct epoll_event event;
+ event.events = EPOLLIN;
+ event.data.u64 = kMagicConstant;
+ EXPECT_THAT(epoll_ctl(epollfd2.get(), EPOLL_CTL_ADD, epollfd.get(), &event),
+ SyscallFailsWithErrno(ELOOP));
+}
+
+TEST(EpollTest, CloseFile) {
+ auto epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ auto eventfd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD());
+ ASSERT_NO_ERRNO(
+ RegisterEpollFD(epollfd.get(), eventfd.get(), EPOLLOUT, kMagicConstant));
+
+ struct epoll_event result[kFDsPerEpoll];
+ ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ(result[0].data.u64, kMagicConstant);
+
+ // Close the event fd early.
+ eventfd.reset();
+
+ EXPECT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, 100),
+ SyscallSucceedsWithValue(0));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/eventfd.cc b/test/syscalls/linux/eventfd.cc
new file mode 100644
index 000000000..dc794415e
--- /dev/null
+++ b/test/syscalls/linux/eventfd.cc
@@ -0,0 +1,222 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/epoll_util.h"
+#include "test/util/eventfd_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(EventfdTest, Nonblock) {
+ FileDescriptor efd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
+
+ uint64_t l;
+ ASSERT_THAT(read(efd.get(), &l, sizeof(l)), SyscallFailsWithErrno(EAGAIN));
+
+ l = 1;
+ ASSERT_THAT(write(efd.get(), &l, sizeof(l)), SyscallSucceeds());
+
+ l = 0;
+ ASSERT_THAT(read(efd.get(), &l, sizeof(l)), SyscallSucceeds());
+ EXPECT_EQ(l, 1);
+
+ ASSERT_THAT(read(efd.get(), &l, sizeof(l)), SyscallFailsWithErrno(EAGAIN));
+}
+
+void* read_three_times(void* arg) {
+ int efd = *reinterpret_cast<int*>(arg);
+ uint64_t l;
+ EXPECT_THAT(read(efd, &l, sizeof(l)), SyscallSucceedsWithValue(sizeof(l)));
+ EXPECT_THAT(read(efd, &l, sizeof(l)), SyscallSucceedsWithValue(sizeof(l)));
+ EXPECT_THAT(read(efd, &l, sizeof(l)), SyscallSucceedsWithValue(sizeof(l)));
+ return nullptr;
+}
+
+TEST(EventfdTest, BlockingWrite) {
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_SEMAPHORE));
+ int efd = fd.get();
+
+ pthread_t p;
+ ASSERT_THAT(pthread_create(&p, nullptr, read_three_times,
+ reinterpret_cast<void*>(&efd)),
+ SyscallSucceeds());
+
+ uint64_t l = 1;
+ ASSERT_THAT(write(efd, &l, sizeof(l)), SyscallSucceeds());
+ EXPECT_EQ(l, 1);
+
+ ASSERT_THAT(write(efd, &l, sizeof(l)), SyscallSucceeds());
+ EXPECT_EQ(l, 1);
+
+ ASSERT_THAT(write(efd, &l, sizeof(l)), SyscallSucceeds());
+ EXPECT_EQ(l, 1);
+
+ ASSERT_THAT(pthread_join(p, nullptr), SyscallSucceeds());
+}
+
+TEST(EventfdTest, SmallWrite) {
+ FileDescriptor efd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
+
+ uint64_t l = 16;
+ ASSERT_THAT(write(efd.get(), &l, 4), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(EventfdTest, SmallRead) {
+ FileDescriptor efd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
+
+ uint64_t l = 1;
+ ASSERT_THAT(write(efd.get(), &l, sizeof(l)), SyscallSucceeds());
+
+ l = 0;
+ ASSERT_THAT(read(efd.get(), &l, 4), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(EventfdTest, IllegalSeek) {
+ FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, 0));
+ EXPECT_THAT(lseek(efd.get(), 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE));
+}
+
+TEST(EventfdTest, IllegalPread) {
+ FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, 0));
+ int l;
+ EXPECT_THAT(pread(efd.get(), &l, sizeof(l), 0),
+ SyscallFailsWithErrno(ESPIPE));
+}
+
+TEST(EventfdTest, IllegalPwrite) {
+ FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, 0));
+ EXPECT_THAT(pwrite(efd.get(), "x", 1, 0), SyscallFailsWithErrno(ESPIPE));
+}
+
+TEST(EventfdTest, BigWrite) {
+ FileDescriptor efd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
+
+ uint64_t big[16];
+ big[0] = 16;
+ ASSERT_THAT(write(efd.get(), big, sizeof(big)), SyscallSucceeds());
+}
+
+TEST(EventfdTest, BigRead) {
+ FileDescriptor efd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
+
+ uint64_t l = 1;
+ ASSERT_THAT(write(efd.get(), &l, sizeof(l)), SyscallSucceeds());
+
+ uint64_t big[16];
+ ASSERT_THAT(read(efd.get(), big, sizeof(big)), SyscallSucceeds());
+ EXPECT_EQ(big[0], 1);
+}
+
+TEST(EventfdTest, BigWriteBigRead) {
+ FileDescriptor efd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
+
+ uint64_t l[16];
+ l[0] = 16;
+ ASSERT_THAT(write(efd.get(), l, sizeof(l)), SyscallSucceeds());
+ ASSERT_THAT(read(efd.get(), l, sizeof(l)), SyscallSucceeds());
+ EXPECT_EQ(l[0], 1);
+}
+
+TEST(EventfdTest, SpliceFromPipePartialSucceeds) {
+ int pipes[2];
+ ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds());
+ const FileDescriptor pipe_rfd(pipes[0]);
+ const FileDescriptor pipe_wfd(pipes[1]);
+ constexpr uint64_t kVal{1};
+
+ FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK));
+
+ uint64_t event_array[2];
+ event_array[0] = kVal;
+ event_array[1] = kVal;
+ ASSERT_THAT(write(pipe_wfd.get(), event_array, sizeof(event_array)),
+ SyscallSucceedsWithValue(sizeof(event_array)));
+ EXPECT_THAT(splice(pipe_rfd.get(), /*__offin=*/nullptr, efd.get(),
+ /*__offout=*/nullptr, sizeof(event_array[0]) + 1,
+ SPLICE_F_NONBLOCK),
+ SyscallSucceedsWithValue(sizeof(event_array[0])));
+
+ uint64_t val;
+ ASSERT_THAT(read(efd.get(), &val, sizeof(val)),
+ SyscallSucceedsWithValue(sizeof(val)));
+ EXPECT_EQ(val, kVal);
+}
+
+// NotifyNonZero is inherently racy, so random save is disabled.
+TEST(EventfdTest, NotifyNonZero_NoRandomSave) {
+ // Waits will time out at 10 seconds.
+ constexpr int kEpollTimeoutMs = 10000;
+ // Create an eventfd descriptor.
+ FileDescriptor efd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(7, EFD_NONBLOCK | EFD_SEMAPHORE));
+ // Create an epoll fd to listen to efd.
+ FileDescriptor epollfd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ // Add efd to epoll.
+ ASSERT_NO_ERRNO(
+ RegisterEpollFD(epollfd.get(), efd.get(), EPOLLIN | EPOLLET, efd.get()));
+
+ // Use epoll to get a value from efd.
+ struct epoll_event out_ev;
+ int wait_out = epoll_wait(epollfd.get(), &out_ev, 1, kEpollTimeoutMs);
+ EXPECT_EQ(wait_out, 1);
+ EXPECT_EQ(efd.get(), out_ev.data.fd);
+ uint64_t val = 0;
+ ASSERT_THAT(read(efd.get(), &val, sizeof(val)), SyscallSucceeds());
+ EXPECT_EQ(val, 1);
+
+ // Start a thread that, after this thread blocks on epoll_wait, will write to
+ // efd. This is racy -- it's possible that this write will happen after
+ // epoll_wait times out.
+ ScopedThread t([&efd] {
+ sleep(5);
+ uint64_t val = 1;
+ EXPECT_THAT(write(efd.get(), &val, sizeof(val)),
+ SyscallSucceedsWithValue(sizeof(val)));
+ });
+
+ // epoll_wait should return once the thread writes.
+ wait_out = epoll_wait(epollfd.get(), &out_ev, 1, kEpollTimeoutMs);
+ EXPECT_EQ(wait_out, 1);
+ EXPECT_EQ(efd.get(), out_ev.data.fd);
+
+ val = 0;
+ ASSERT_THAT(read(efd.get(), &val, sizeof(val)), SyscallSucceeds());
+ EXPECT_EQ(val, 1);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/exceptions.cc b/test/syscalls/linux/exceptions.cc
new file mode 100644
index 000000000..420b9543f
--- /dev/null
+++ b/test/syscalls/linux/exceptions.cc
@@ -0,0 +1,367 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+
+#include "gtest/gtest.h"
+#include "test/util/logging.h"
+#include "test/util/platform_util.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Default value for the x87 FPU control word. See Intel SDM Vol 1, Ch 8.1.5
+// "x87 FPU Control Word".
+constexpr uint16_t kX87ControlWordDefault = 0x37f;
+
+// Mask for the divide-by-zero exception.
+constexpr uint16_t kX87ControlWordDiv0Mask = 1 << 2;
+
+// Default value for the SSE control register (MXCSR). See Intel SDM Vol 1, Ch
+// 11.6.4 "Initialization of SSE/SSE3 Extensions".
+constexpr uint32_t kMXCSRDefault = 0x1f80;
+
+// Mask for the divide-by-zero exception.
+constexpr uint32_t kMXCSRDiv0Mask = 1 << 9;
+
+// Flag for a pending divide-by-zero exception.
+constexpr uint32_t kMXCSRDiv0Flag = 1 << 2;
+
+void inline Halt() { asm("hlt\r\n"); }
+
+void inline SetAlignmentCheck() {
+ asm("subq $128, %%rsp\r\n" // Avoid potential red zone clobber
+ "pushf\r\n"
+ "pop %%rax\r\n"
+ "or $0x40000, %%rax\r\n"
+ "push %%rax\r\n"
+ "popf\r\n"
+ "addq $128, %%rsp\r\n"
+ :
+ :
+ : "ax");
+}
+
+void inline ClearAlignmentCheck() {
+ asm("subq $128, %%rsp\r\n" // Avoid potential red zone clobber
+ "pushf\r\n"
+ "pop %%rax\r\n"
+ "mov $0x40000, %%rbx\r\n"
+ "not %%rbx\r\n"
+ "and %%rbx, %%rax\r\n"
+ "push %%rax\r\n"
+ "popf\r\n"
+ "addq $128, %%rsp\r\n"
+ :
+ :
+ : "ax", "bx");
+}
+
+void inline Int3Normal() { asm(".byte 0xcd, 0x03\r\n"); }
+
+void inline Int3Compact() { asm(".byte 0xcc\r\n"); }
+
+void InIOHelper(int width, int value) {
+ EXPECT_EXIT(
+ {
+ switch (width) {
+ case 1:
+ asm volatile("inb %%dx, %%al" ::"d"(value) : "%eax");
+ break;
+ case 2:
+ asm volatile("inw %%dx, %%ax" ::"d"(value) : "%eax");
+ break;
+ case 4:
+ asm volatile("inl %%dx, %%eax" ::"d"(value) : "%eax");
+ break;
+ default:
+ FAIL() << "invalid input width, only 1, 2 or 4 is allowed";
+ }
+ },
+ ::testing::KilledBySignal(SIGSEGV), "");
+}
+
+TEST(ExceptionTest, Halt) {
+ // In order to prevent the regular handler from messing with things (and
+ // perhaps refaulting until some other signal occurs), we reset the handler to
+ // the default action here and ensure that it dies correctly.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGSEGV, sa));
+
+ EXPECT_EXIT(Halt(), ::testing::KilledBySignal(SIGSEGV), "");
+}
+
+TEST(ExceptionTest, DivideByZero) {
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+ EXPECT_EXIT(
+ {
+ uint32_t remainder;
+ uint32_t quotient;
+ uint32_t divisor = 0;
+ uint64_t value = 1;
+ asm("divl 0(%2)\r\n"
+ : "=d"(remainder), "=a"(quotient)
+ : "r"(&divisor), "d"(value >> 32), "a"(value));
+ TEST_CHECK(quotient > 0); // Force dependency.
+ },
+ ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// By default, x87 exceptions are masked and simply return a default value.
+TEST(ExceptionTest, X87DivideByZeroMasked) {
+ int32_t quotient;
+ int32_t value = 1;
+ int32_t divisor = 0;
+ asm("fildl %[value]\r\n"
+ "fidivl %[divisor]\r\n"
+ "fistpl %[quotient]\r\n"
+ : [ quotient ] "=m"(quotient)
+ : [ value ] "m"(value), [ divisor ] "m"(divisor));
+
+ EXPECT_EQ(quotient, INT32_MIN);
+}
+
+// When unmasked, division by zero raises SIGFPE.
+TEST(ExceptionTest, X87DivideByZeroUnmasked) {
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+ EXPECT_EXIT(
+ {
+ // Clear the divide by zero exception mask.
+ constexpr uint16_t kControlWord =
+ kX87ControlWordDefault & ~kX87ControlWordDiv0Mask;
+
+ int32_t quotient;
+ int32_t value = 1;
+ int32_t divisor = 0;
+ asm volatile(
+ "fldcw %[cw]\r\n"
+ "fildl %[value]\r\n"
+ "fidivl %[divisor]\r\n"
+ "fistpl %[quotient]\r\n"
+ : [ quotient ] "=m"(quotient)
+ : [ cw ] "m"(kControlWord), [ value ] "m"(value),
+ [ divisor ] "m"(divisor));
+ },
+ ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// Pending exceptions in the x87 status register are not clobbered by syscalls.
+TEST(ExceptionTest, X87StatusClobber) {
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+ EXPECT_EXIT(
+ {
+ // Clear the divide by zero exception mask.
+ constexpr uint16_t kControlWord =
+ kX87ControlWordDefault & ~kX87ControlWordDiv0Mask;
+
+ int32_t quotient;
+ int32_t value = 1;
+ int32_t divisor = 0;
+ asm volatile(
+ "fildl %[value]\r\n"
+ "fidivl %[divisor]\r\n"
+ // Exception is masked, so it does not occur here.
+ "fistpl %[quotient]\r\n"
+
+ // SYS_getpid placed in rax by constraint.
+ "syscall\r\n"
+
+ // Unmask exception. The syscall didn't clobber the pending
+ // exception, so now it can be raised.
+ //
+ // N.B. "a floating-point exception will be generated upon execution
+ // of the *next* floating-point instruction".
+ "fldcw %[cw]\r\n"
+ "fwait\r\n"
+ : [ quotient ] "=m"(quotient)
+ : [ value ] "m"(value), [ divisor ] "m"(divisor), "a"(SYS_getpid),
+ [ cw ] "m"(kControlWord)
+ : "rcx", "r11");
+ },
+ ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// By default, SSE exceptions are masked and simply return a default value.
+TEST(ExceptionTest, SSEDivideByZeroMasked) {
+ uint32_t status;
+ int32_t quotient;
+ int32_t value = 1;
+ int32_t divisor = 0;
+ asm("cvtsi2ssl %[value], %%xmm0\r\n"
+ "cvtsi2ssl %[divisor], %%xmm1\r\n"
+ "divss %%xmm1, %%xmm0\r\n"
+ "cvtss2sil %%xmm0, %[quotient]\r\n"
+ : [ quotient ] "=r"(quotient), [ status ] "=r"(status)
+ : [ value ] "r"(value), [ divisor ] "r"(divisor)
+ : "xmm0", "xmm1");
+
+ EXPECT_EQ(quotient, INT32_MIN);
+}
+
+// When unmasked, division by zero raises SIGFPE.
+TEST(ExceptionTest, SSEDivideByZeroUnmasked) {
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGFPE, sa));
+
+ EXPECT_EXIT(
+ {
+ // Clear the divide by zero exception mask.
+ constexpr uint32_t kMXCSR = kMXCSRDefault & ~kMXCSRDiv0Mask;
+
+ int32_t quotient;
+ int32_t value = 1;
+ int32_t divisor = 0;
+ asm volatile(
+ "ldmxcsr %[mxcsr]\r\n"
+ "cvtsi2ssl %[value], %%xmm0\r\n"
+ "cvtsi2ssl %[divisor], %%xmm1\r\n"
+ "divss %%xmm1, %%xmm0\r\n"
+ "cvtss2sil %%xmm0, %[quotient]\r\n"
+ : [ quotient ] "=r"(quotient)
+ : [ mxcsr ] "m"(kMXCSR), [ value ] "r"(value),
+ [ divisor ] "r"(divisor)
+ : "xmm0", "xmm1");
+ },
+ ::testing::KilledBySignal(SIGFPE), "");
+}
+
+// Pending exceptions in the SSE status register are not clobbered by syscalls.
+TEST(ExceptionTest, SSEStatusClobber) {
+ uint32_t mxcsr;
+ int32_t quotient;
+ int32_t value = 1;
+ int32_t divisor = 0;
+ asm("cvtsi2ssl %[value], %%xmm0\r\n"
+ "cvtsi2ssl %[divisor], %%xmm1\r\n"
+ "divss %%xmm1, %%xmm0\r\n"
+ // Exception is masked, so it does not occur here.
+ "cvtss2sil %%xmm0, %[quotient]\r\n"
+
+ // SYS_getpid placed in rax by constraint.
+ "syscall\r\n"
+
+ // Intel SDM Vol 1, Ch 10.2.3.1 "SIMD Floating-Point Mask and Flag Bits":
+ // "If LDMXCSR or FXRSTOR clears a mask bit and sets the corresponding
+ // exception flag bit, a SIMD floating-point exception will not be
+ // generated as a result of this change. The unmasked exception will be
+ // generated only upon the execution of the next SSE/SSE2/SSE3 instruction
+ // that detects the unmasked exception condition."
+ //
+ // Though ambiguous, empirical evidence indicates that this means that
+ // exception flags set in the status register will never cause an
+ // exception to be raised; only a new exception condition will do so.
+ //
+ // Thus here we just check for the flag itself rather than trying to raise
+ // the exception.
+ "stmxcsr %[mxcsr]\r\n"
+ : [ quotient ] "=r"(quotient), [ mxcsr ] "+m"(mxcsr)
+ : [ value ] "r"(value), [ divisor ] "r"(divisor), "a"(SYS_getpid)
+ : "xmm0", "xmm1", "rcx", "r11");
+
+ EXPECT_TRUE(mxcsr & kMXCSRDiv0Flag);
+}
+
+TEST(ExceptionTest, IOAccessFault) {
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGSEGV, sa));
+
+ InIOHelper(1, 0x0);
+ InIOHelper(2, 0x7);
+ InIOHelper(4, 0x6);
+ InIOHelper(1, 0xffff);
+ InIOHelper(2, 0xffff);
+ InIOHelper(4, 0xfffd);
+}
+
+TEST(ExceptionTest, Alignment) {
+ SetAlignmentCheck();
+ ClearAlignmentCheck();
+}
+
+TEST(ExceptionTest, AlignmentHalt) {
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGSEGV, sa));
+
+ // Reported upstream. We need to ensure that bad flags are cleared even in
+ // fault paths. Set the alignment flag and then generate an exception.
+ EXPECT_EXIT(
+ {
+ SetAlignmentCheck();
+ Halt();
+ },
+ ::testing::KilledBySignal(SIGSEGV), "");
+}
+
+TEST(ExceptionTest, AlignmentCheck) {
+ SKIP_IF(PlatformSupportAlignmentCheck() != PlatformSupport::Allowed);
+
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGBUS, sa));
+
+ EXPECT_EXIT(
+ {
+ char array[16];
+ SetAlignmentCheck();
+ for (int i = 0; i < 8; i++) {
+ // At least 7/8 offsets will be unaligned here.
+ uint64_t* ptr = reinterpret_cast<uint64_t*>(&array[i]);
+ asm("mov %0, 0(%0)\r\n" : : "r"(ptr) : "ax");
+ }
+ },
+ ::testing::KilledBySignal(SIGBUS), "");
+}
+
+TEST(ExceptionTest, Int3Normal) {
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGTRAP, sa));
+
+ EXPECT_EXIT(Int3Normal(), ::testing::KilledBySignal(SIGTRAP), "");
+}
+
+TEST(ExceptionTest, Int3Compact) {
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_DFL;
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGTRAP, sa));
+
+ EXPECT_EXIT(Int3Compact(), ::testing::KilledBySignal(SIGTRAP), "");
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/exec.cc b/test/syscalls/linux/exec.cc
new file mode 100644
index 000000000..c5acfc794
--- /dev/null
+++ b/test/syscalls/linux/exec.cc
@@ -0,0 +1,904 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/exec.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/eventfd.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/strings/match.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/types/optional.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr char kBasicWorkload[] = "test/syscalls/linux/exec_basic_workload";
+constexpr char kExitScript[] = "test/syscalls/linux/exit_script";
+constexpr char kStateWorkload[] = "test/syscalls/linux/exec_state_workload";
+constexpr char kProcExeWorkload[] =
+ "test/syscalls/linux/exec_proc_exe_workload";
+constexpr char kAssertClosedWorkload[] =
+ "test/syscalls/linux/exec_assert_closed_workload";
+constexpr char kPriorityWorkload[] = "test/syscalls/linux/priority_execve";
+
+constexpr char kExit42[] = "--exec_exit_42";
+constexpr char kExecWithThread[] = "--exec_exec_with_thread";
+constexpr char kExecFromThread[] = "--exec_exec_from_thread";
+
+// Runs file specified by dirfd and pathname with argv and checks that the exit
+// status is expect_status and that stderr contains expect_stderr.
+void CheckExecHelper(const absl::optional<int32_t> dirfd,
+ const std::string& pathname, const ExecveArray& argv,
+ const ExecveArray& envv, const int flags,
+ int expect_status, const std::string& expect_stderr) {
+ int pipe_fds[2];
+ ASSERT_THAT(pipe2(pipe_fds, O_CLOEXEC), SyscallSucceeds());
+
+ FileDescriptor read_fd(pipe_fds[0]);
+ FileDescriptor write_fd(pipe_fds[1]);
+
+ pid_t child;
+ int execve_errno;
+
+ const auto remap_stderr = [pipe_fds] {
+ // Remap stdin and stdout to /dev/null.
+ int fd = open("/dev/null", O_RDWR | O_CLOEXEC);
+ if (fd < 0) {
+ _exit(errno);
+ }
+
+ int ret = dup2(fd, 0);
+ if (ret < 0) {
+ _exit(errno);
+ }
+
+ ret = dup2(fd, 1);
+ if (ret < 0) {
+ _exit(errno);
+ }
+
+ // And stderr to the pipe.
+ ret = dup2(pipe_fds[1], 2);
+ if (ret < 0) {
+ _exit(errno);
+ }
+
+ // Here, we'd ideally close all other FDs inherited from the parent.
+ // However, that's not worth the effort and CloexecNormalFile and
+ // CloexecEventfd depend on that not happening.
+ };
+
+ Cleanup kill;
+ if (dirfd.has_value()) {
+ kill = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(*dirfd, pathname, argv,
+ envv, flags, remap_stderr,
+ &child, &execve_errno));
+ } else {
+ kill = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(pathname, argv, envv, remap_stderr, &child, &execve_errno));
+ }
+
+ ASSERT_EQ(0, execve_errno);
+
+ // Not needed anymore.
+ write_fd.reset();
+
+ // Read stderr until the child exits.
+ std::string output;
+ constexpr int kSize = 128;
+ char buf[kSize];
+ int n;
+ do {
+ ASSERT_THAT(n = ReadFd(read_fd.get(), buf, kSize), SyscallSucceeds());
+ if (n > 0) {
+ output.append(buf, n);
+ }
+ } while (n > 0);
+
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+ EXPECT_EQ(status, expect_status);
+
+ // Process cleanup no longer needed.
+ kill.Release();
+
+ EXPECT_TRUE(absl::StrContains(output, expect_stderr)) << output;
+}
+
+void CheckExec(const std::string& filename, const ExecveArray& argv,
+ const ExecveArray& envv, int expect_status,
+ const std::string& expect_stderr) {
+ CheckExecHelper(/*dirfd=*/absl::optional<int32_t>(), filename, argv, envv,
+ /*flags=*/0, expect_status, expect_stderr);
+}
+
+void CheckExecveat(const int32_t dirfd, const std::string& pathname,
+ const ExecveArray& argv, const ExecveArray& envv,
+ const int flags, int expect_status,
+ const std::string& expect_stderr) {
+ CheckExecHelper(absl::optional<int32_t>(dirfd), pathname, argv, envv, flags,
+ expect_status, expect_stderr);
+}
+
+TEST(ExecTest, EmptyPath) {
+ int execve_errno;
+ ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec("", {}, {}, nullptr, &execve_errno));
+ EXPECT_EQ(execve_errno, ENOENT);
+}
+
+TEST(ExecTest, Basic) {
+ CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)}, {},
+ ArgEnvExitStatus(0, 0),
+ absl::StrCat(RunfilePath(kBasicWorkload), "\n"));
+}
+
+TEST(ExecTest, OneArg) {
+ CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload), "1"}, {},
+ ArgEnvExitStatus(1, 0),
+ absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n"));
+}
+
+TEST(ExecTest, FiveArg) {
+ CheckExec(RunfilePath(kBasicWorkload),
+ {RunfilePath(kBasicWorkload), "1", "2", "3", "4", "5"}, {},
+ ArgEnvExitStatus(5, 0),
+ absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
+}
+
+TEST(ExecTest, OneEnv) {
+ CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)}, {"1"},
+ ArgEnvExitStatus(0, 1),
+ absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n"));
+}
+
+TEST(ExecTest, FiveEnv) {
+ CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload)},
+ {"1", "2", "3", "4", "5"}, ArgEnvExitStatus(0, 5),
+ absl::StrCat(RunfilePath(kBasicWorkload), "\n1\n2\n3\n4\n5\n"));
+}
+
+TEST(ExecTest, OneArgOneEnv) {
+ CheckExec(RunfilePath(kBasicWorkload), {RunfilePath(kBasicWorkload), "arg"},
+ {"env"}, ArgEnvExitStatus(1, 1),
+ absl::StrCat(RunfilePath(kBasicWorkload), "\narg\nenv\n"));
+}
+
+TEST(ExecTest, InterpreterScript) {
+ CheckExec(RunfilePath(kExitScript), {RunfilePath(kExitScript), "25"}, {},
+ ArgEnvExitStatus(25, 0), "");
+}
+
+// Everything after the path in the interpreter script is a single argument.
+TEST(ExecTest, InterpreterScriptArgSplit) {
+ // Symlink through /tmp to ensure the path is short enough.
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " foo bar"),
+ 0755));
+
+ CheckExec(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0),
+ absl::StrCat(link.path(), "\nfoo bar\n", script.path(), "\n"));
+}
+
+// Original argv[0] is replaced with the script path.
+TEST(ExecTest, InterpreterScriptArgvZero) {
+ // Symlink through /tmp to ensure the path is short enough.
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
+
+ CheckExec(script.path(), {"REPLACED"}, {}, ArgEnvExitStatus(1, 0),
+ absl::StrCat(link.path(), "\n", script.path(), "\n"));
+}
+
+// Original argv[0] is replaced with the script path, exactly as passed to
+// execve.
+TEST(ExecTest, InterpreterScriptArgvZeroRelative) {
+ // Symlink through /tmp to ensure the path is short enough.
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
+
+ auto cwd = ASSERT_NO_ERRNO_AND_VALUE(GetCWD());
+ auto script_relative =
+ ASSERT_NO_ERRNO_AND_VALUE(GetRelativePath(cwd, script.path()));
+
+ CheckExec(script_relative, {"REPLACED"}, {}, ArgEnvExitStatus(1, 0),
+ absl::StrCat(link.path(), "\n", script_relative, "\n"));
+}
+
+// argv[0] is added as the script path, even if there was none.
+TEST(ExecTest, InterpreterScriptArgvZeroAdded) {
+ // Symlink through /tmp to ensure the path is short enough.
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path()), 0755));
+
+ CheckExec(script.path(), {}, {}, ArgEnvExitStatus(1, 0),
+ absl::StrCat(link.path(), "\n", script.path(), "\n"));
+}
+
+// A NUL byte in the script line ends parsing.
+TEST(ExecTest, InterpreterScriptArgNUL) {
+ // Symlink through /tmp to ensure the path is short enough.
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(),
+ absl::StrCat("#!", link.path(), " foo", std::string(1, '\0'), "bar"),
+ 0755));
+
+ CheckExec(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0),
+ absl::StrCat(link.path(), "\nfoo\n", script.path(), "\n"));
+}
+
+// Trailing whitespace following interpreter path is ignored.
+TEST(ExecTest, InterpreterScriptTrailingWhitespace) {
+ // Symlink through /tmp to ensure the path is short enough.
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " "), 0755));
+
+ CheckExec(script.path(), {script.path()}, {}, ArgEnvExitStatus(1, 0),
+ absl::StrCat(link.path(), "\n", script.path(), "\n"));
+}
+
+// Multiple whitespace characters between interpreter and arg allowed.
+TEST(ExecTest, InterpreterScriptArgWhitespace) {
+ // Symlink through /tmp to ensure the path is short enough.
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " foo"), 0755));
+
+ CheckExec(script.path(), {script.path()}, {}, ArgEnvExitStatus(2, 0),
+ absl::StrCat(link.path(), "\nfoo\n", script.path(), "\n"));
+}
+
+TEST(ExecTest, InterpreterScriptNoPath) {
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "#!", 0755));
+
+ int execve_errno;
+ ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(script.path(), {script.path()}, {}, nullptr, &execve_errno));
+ EXPECT_EQ(execve_errno, ENOEXEC);
+}
+
+// AT_EXECFN is the path passed to execve.
+TEST(ExecTest, ExecFn) {
+ // Symlink through /tmp to ensure the path is short enough.
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kStateWorkload)));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", link.path(), " PrintExecFn"),
+ 0755));
+
+ // Pass the script as a relative path and assert that is what appears in
+ // AT_EXECFN.
+ auto cwd = ASSERT_NO_ERRNO_AND_VALUE(GetCWD());
+ auto script_relative =
+ ASSERT_NO_ERRNO_AND_VALUE(GetRelativePath(cwd, script.path()));
+
+ CheckExec(script_relative, {script_relative}, {}, ArgEnvExitStatus(0, 0),
+ absl::StrCat(script_relative, "\n"));
+}
+
+TEST(ExecTest, ExecName) {
+ std::string path = RunfilePath(kStateWorkload);
+
+ CheckExec(path, {path, "PrintExecName"}, {}, ArgEnvExitStatus(0, 0),
+ absl::StrCat(Basename(path).substr(0, 15), "\n"));
+}
+
+TEST(ExecTest, ExecNameScript) {
+ // Symlink through /tmp to ensure the path is short enough.
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kStateWorkload)));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(),
+ absl::StrCat("#!", link.path(), " PrintExecName"), 0755));
+
+ std::string script_path = script.path();
+
+ CheckExec(script_path, {script_path}, {}, ArgEnvExitStatus(0, 0),
+ absl::StrCat(Basename(script_path).substr(0, 15), "\n"));
+}
+
+// execve may be called by a multithreaded process.
+TEST(ExecTest, WithSiblingThread) {
+ CheckExec("/proc/self/exe", {"/proc/self/exe", kExecWithThread}, {},
+ W_EXITCODE(42, 0), "");
+}
+
+// execve may be called from a thread other than the leader of a multithreaded
+// process.
+TEST(ExecTest, FromSiblingThread) {
+ CheckExec("/proc/self/exe", {"/proc/self/exe", kExecFromThread}, {},
+ W_EXITCODE(42, 0), "");
+}
+
+TEST(ExecTest, NotFound) {
+ char* const argv[] = {nullptr};
+ char* const envp[] = {nullptr};
+ EXPECT_THAT(execve("/file/does/not/exist", argv, envp),
+ SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(ExecTest, NoExecPerm) {
+ char* const argv[] = {nullptr};
+ char* const envp[] = {nullptr};
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ EXPECT_THAT(execve(f.path().c_str(), argv, envp),
+ SyscallFailsWithErrno(EACCES));
+}
+
+// A signal handler we never expect to be called.
+void SignalHandler(int signo) {
+ std::cerr << "Signal " << signo << " raised." << std::endl;
+ exit(1);
+}
+
+// Signal handlers are reset on execve(2), unless they have default or ignored
+// disposition.
+TEST(ExecStateTest, HandlerReset) {
+ struct sigaction sa;
+ sa.sa_handler = SignalHandler;
+ ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
+
+ ExecveArray args = {
+ RunfilePath(kStateWorkload),
+ "CheckSigHandler",
+ absl::StrCat(SIGUSR1),
+ absl::StrCat(absl::Hex(reinterpret_cast<uintptr_t>(SIG_DFL))),
+ };
+
+ CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+}
+
+// Ignored signal dispositions are not reset.
+TEST(ExecStateTest, IgnorePreserved) {
+ struct sigaction sa;
+ sa.sa_handler = SIG_IGN;
+ ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
+
+ ExecveArray args = {
+ RunfilePath(kStateWorkload),
+ "CheckSigHandler",
+ absl::StrCat(SIGUSR1),
+ absl::StrCat(absl::Hex(reinterpret_cast<uintptr_t>(SIG_IGN))),
+ };
+
+ CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+}
+
+// Signal masks are not reset on exec
+TEST(ExecStateTest, SignalMask) {
+ sigset_t s;
+ sigemptyset(&s);
+ sigaddset(&s, SIGUSR1);
+ ASSERT_THAT(sigprocmask(SIG_BLOCK, &s, nullptr), SyscallSucceeds());
+
+ ExecveArray args = {
+ RunfilePath(kStateWorkload),
+ "CheckSigBlocked",
+ absl::StrCat(SIGUSR1),
+ };
+
+ CheckExec(RunfilePath(kStateWorkload), args, {}, W_EXITCODE(0, 0), "");
+}
+
+// itimers persist across execve.
+// N.B. Timers created with timer_create(2) should not be preserved!
+TEST(ExecStateTest, ItimerPreserved) {
+ // The fork in ForkAndExec clears itimers, so only set them up after fork.
+ auto setup_itimer = [] {
+ // Ignore SIGALRM, as we don't actually care about timer
+ // expirations.
+ struct sigaction sa;
+ sa.sa_handler = SIG_IGN;
+ int ret = sigaction(SIGALRM, &sa, nullptr);
+ if (ret < 0) {
+ _exit(errno);
+ }
+
+ struct itimerval itv;
+ itv.it_interval.tv_sec = 1;
+ itv.it_interval.tv_usec = 0;
+ itv.it_value.tv_sec = 1;
+ itv.it_value.tv_usec = 0;
+ ret = setitimer(ITIMER_REAL, &itv, nullptr);
+ if (ret < 0) {
+ _exit(errno);
+ }
+ };
+
+ std::string filename = RunfilePath(kStateWorkload);
+ ExecveArray argv = {
+ filename,
+ "CheckItimerEnabled",
+ absl::StrCat(ITIMER_REAL),
+ };
+
+ pid_t child;
+ int execve_errno;
+ auto kill = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(filename, argv, {}, setup_itimer, &child, &execve_errno));
+ ASSERT_EQ(0, execve_errno);
+
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+ EXPECT_EQ(0, status);
+
+ // Process cleanup no longer needed.
+ kill.Release();
+}
+
+TEST(ProcSelfExe, ChangesAcrossExecve) {
+ // See exec_proc_exe_workload for more details. We simply
+ // assert that the /proc/self/exe link changes across execve.
+ CheckExec(RunfilePath(kProcExeWorkload),
+ {RunfilePath(kProcExeWorkload),
+ ASSERT_NO_ERRNO_AND_VALUE(ProcessExePath(getpid()))},
+ {}, W_EXITCODE(0, 0), "");
+}
+
+TEST(ExecTest, CloexecNormalFile) {
+ TempPath tempFile = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "bar", 0755));
+ const FileDescriptor fd_closed_on_exec =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY | O_CLOEXEC));
+
+ CheckExec(RunfilePath(kAssertClosedWorkload),
+ {RunfilePath(kAssertClosedWorkload),
+ absl::StrCat(fd_closed_on_exec.get())},
+ {}, W_EXITCODE(0, 0), "");
+
+ // The assert closed workload exits with code 2 if the file still exists. We
+ // can use this to do a negative test.
+ const FileDescriptor fd_open_on_exec =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY));
+
+ CheckExec(
+ RunfilePath(kAssertClosedWorkload),
+ {RunfilePath(kAssertClosedWorkload), absl::StrCat(fd_open_on_exec.get())},
+ {}, W_EXITCODE(2, 0), "");
+}
+
+TEST(ExecTest, CloexecEventfd) {
+ int efd;
+ ASSERT_THAT(efd = eventfd(0, EFD_CLOEXEC), SyscallSucceeds());
+ FileDescriptor fd(efd);
+
+ CheckExec(RunfilePath(kAssertClosedWorkload),
+ {RunfilePath(kAssertClosedWorkload), absl::StrCat(fd.get())}, {},
+ W_EXITCODE(0, 0), "");
+}
+
+constexpr int kLinuxMaxSymlinks = 40;
+
+TEST(ExecTest, SymlinkLimitExceeded) {
+ std::string path = RunfilePath(kBasicWorkload);
+
+ // Hold onto TempPath objects so they are not destructed prematurely.
+ std::vector<TempPath> symlinks;
+ for (int i = 0; i < kLinuxMaxSymlinks + 1; i++) {
+ symlinks.push_back(
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateSymlinkTo("/tmp", path)));
+ path = symlinks[i].path();
+ }
+
+ int execve_errno;
+ ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(path, {path}, {}, /*child=*/nullptr, &execve_errno));
+ EXPECT_EQ(execve_errno, ELOOP);
+}
+
+TEST(ExecTest, SymlinkLimitRefreshedForInterpreter) {
+ std::string tmp_dir = "/tmp";
+ std::string interpreter_path = "/bin/echo";
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ tmp_dir, absl::StrCat("#!", interpreter_path), 0755));
+ std::string script_path = script.path();
+
+ // Hold onto TempPath objects so they are not destructed prematurely.
+ std::vector<TempPath> interpreter_symlinks;
+ std::vector<TempPath> script_symlinks;
+ // Replace both the interpreter and script paths with symlink chains of just
+ // over half the symlink limit each; this is the minimum required to test that
+ // the symlink limit applies separately to each traversal, while tolerating
+ // some symlinks in the resolution of (the original) interpreter_path and
+ // script_path.
+ for (int i = 0; i < (kLinuxMaxSymlinks / 2) + 1; i++) {
+ interpreter_symlinks.push_back(ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(tmp_dir, interpreter_path)));
+ interpreter_path = interpreter_symlinks[i].path();
+ script_symlinks.push_back(ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(tmp_dir, script_path)));
+ script_path = script_symlinks[i].path();
+ }
+
+ CheckExec(script_path, {script_path}, {}, ArgEnvExitStatus(0, 0), "");
+}
+
+TEST(ExecveatTest, BasicWithFDCWD) {
+ std::string path = RunfilePath(kBasicWorkload);
+ CheckExecveat(AT_FDCWD, path, {path}, {}, /*flags=*/0, ArgEnvExitStatus(0, 0),
+ absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, Basic) {
+ std::string absolute_path = RunfilePath(kBasicWorkload);
+ std::string parent_dir = std::string(Dirname(absolute_path));
+ std::string base = std::string(Basename(absolute_path));
+ const FileDescriptor dirfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
+
+ CheckExecveat(dirfd.get(), base, {absolute_path}, {}, /*flags=*/0,
+ ArgEnvExitStatus(0, 0), absl::StrCat(absolute_path, "\n"));
+}
+
+TEST(ExecveatTest, FDNotADirectory) {
+ std::string absolute_path = RunfilePath(kBasicWorkload);
+ std::string base = std::string(Basename(absolute_path));
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(absolute_path, 0));
+
+ int execve_errno;
+ ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(fd.get(), base, {absolute_path}, {},
+ /*flags=*/0, /*child=*/nullptr,
+ &execve_errno));
+ EXPECT_EQ(execve_errno, ENOTDIR);
+}
+
+TEST(ExecveatTest, AbsolutePathWithFDCWD) {
+ std::string path = RunfilePath(kBasicWorkload);
+ CheckExecveat(AT_FDCWD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0,
+ absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, AbsolutePath) {
+ std::string path = RunfilePath(kBasicWorkload);
+ // File descriptor should be ignored when an absolute path is given.
+ const int32_t badFD = -1;
+ CheckExecveat(badFD, path, {path}, {}, ArgEnvExitStatus(0, 0), 0,
+ absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, EmptyPathBasic) {
+ std::string path = RunfilePath(kBasicWorkload);
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
+
+ CheckExecveat(fd.get(), "", {path}, {}, AT_EMPTY_PATH, ArgEnvExitStatus(0, 0),
+ absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, EmptyPathWithDirFD) {
+ std::string path = RunfilePath(kBasicWorkload);
+ std::string parent_dir = std::string(Dirname(path));
+ const FileDescriptor dirfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
+
+ int execve_errno;
+ ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(dirfd.get(), "", {path}, {},
+ AT_EMPTY_PATH,
+ /*child=*/nullptr, &execve_errno));
+ EXPECT_EQ(execve_errno, EACCES);
+}
+
+TEST(ExecveatTest, EmptyPathWithoutEmptyPathFlag) {
+ std::string path = RunfilePath(kBasicWorkload);
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
+
+ int execve_errno;
+ ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(
+ fd.get(), "", {path}, {}, /*flags=*/0, /*child=*/nullptr, &execve_errno));
+ EXPECT_EQ(execve_errno, ENOENT);
+}
+
+TEST(ExecveatTest, AbsolutePathWithEmptyPathFlag) {
+ std::string path = RunfilePath(kBasicWorkload);
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_PATH));
+
+ CheckExecveat(fd.get(), path, {path}, {}, AT_EMPTY_PATH,
+ ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, RelativePathWithEmptyPathFlag) {
+ std::string absolute_path = RunfilePath(kBasicWorkload);
+ std::string parent_dir = std::string(Dirname(absolute_path));
+ std::string base = std::string(Basename(absolute_path));
+ const FileDescriptor dirfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
+
+ CheckExecveat(dirfd.get(), base, {absolute_path}, {}, AT_EMPTY_PATH,
+ ArgEnvExitStatus(0, 0), absl::StrCat(absolute_path, "\n"));
+}
+
+TEST(ExecveatTest, SymlinkNoFollowWithRelativePath) {
+ std::string parent_dir = "/tmp";
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(parent_dir, RunfilePath(kBasicWorkload)));
+ const FileDescriptor dirfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_DIRECTORY));
+ std::string base = std::string(Basename(link.path()));
+
+ int execve_errno;
+ ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(dirfd.get(), base, {base}, {},
+ AT_SYMLINK_NOFOLLOW,
+ /*child=*/nullptr, &execve_errno));
+ EXPECT_EQ(execve_errno, ELOOP);
+}
+
+TEST(ExecveatTest, UnshareFiles) {
+ TempPath tempFile = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "bar", 0755));
+ const FileDescriptor fd_closed_on_exec =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(tempFile.path(), O_RDONLY | O_CLOEXEC));
+
+ ExecveArray argv = {"test"};
+ ExecveArray envp;
+ std::string child_path = RunfilePath(kBasicWorkload);
+ pid_t child =
+ syscall(__NR_clone, SIGCHLD | CLONE_VFORK | CLONE_FILES, 0, 0, 0, 0);
+ if (child == 0) {
+ execve(child_path.c_str(), argv.get(), envp.get());
+ _exit(1);
+ }
+ ASSERT_THAT(child, SyscallSucceeds());
+
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0), SyscallSucceeds());
+ EXPECT_EQ(status, 0);
+
+ struct stat st;
+ EXPECT_THAT(fstat(fd_closed_on_exec.get(), &st), SyscallSucceeds());
+}
+
+TEST(ExecveatTest, SymlinkNoFollowWithAbsolutePath) {
+ std::string parent_dir = "/tmp";
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(parent_dir, RunfilePath(kBasicWorkload)));
+ std::string path = link.path();
+
+ int execve_errno;
+ ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(AT_FDCWD, path, {path}, {},
+ AT_SYMLINK_NOFOLLOW,
+ /*child=*/nullptr, &execve_errno));
+ EXPECT_EQ(execve_errno, ELOOP);
+}
+
+TEST(ExecveatTest, SymlinkNoFollowAndEmptyPath) {
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo("/tmp", RunfilePath(kBasicWorkload)));
+ std::string path = link.path();
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, 0));
+
+ CheckExecveat(fd.get(), "", {path}, {}, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
+ ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, SymlinkNoFollowIgnoreSymlinkAncestor) {
+ TempPath parent_link =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateSymlinkTo("/tmp", "/bin"));
+ std::string path_with_symlink = JoinPath(parent_link.path(), "echo");
+
+ CheckExecveat(AT_FDCWD, path_with_symlink, {path_with_symlink}, {},
+ AT_SYMLINK_NOFOLLOW, ArgEnvExitStatus(0, 0), "");
+}
+
+TEST(ExecveatTest, SymlinkNoFollowWithNormalFile) {
+ const FileDescriptor dirfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/bin", O_DIRECTORY));
+
+ CheckExecveat(dirfd.get(), "echo", {"echo"}, {}, AT_SYMLINK_NOFOLLOW,
+ ArgEnvExitStatus(0, 0), "");
+}
+
+TEST(ExecveatTest, BasicWithCloexecFD) {
+ std::string path = RunfilePath(kBasicWorkload);
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC));
+
+ CheckExecveat(fd.get(), "", {path}, {}, AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH,
+ ArgEnvExitStatus(0, 0), absl::StrCat(path, "\n"));
+}
+
+TEST(ExecveatTest, InterpreterScriptWithCloexecFD) {
+ std::string path = RunfilePath(kExitScript);
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_CLOEXEC));
+
+ int execve_errno;
+ ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(fd.get(), "", {path}, {},
+ AT_EMPTY_PATH, /*child=*/nullptr,
+ &execve_errno));
+ EXPECT_EQ(execve_errno, ENOENT);
+}
+
+TEST(ExecveatTest, InterpreterScriptWithCloexecDirFD) {
+ std::string absolute_path = RunfilePath(kExitScript);
+ std::string parent_dir = std::string(Dirname(absolute_path));
+ std::string base = std::string(Basename(absolute_path));
+ const FileDescriptor dirfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(parent_dir, O_CLOEXEC | O_DIRECTORY));
+
+ int execve_errno;
+ ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(dirfd.get(), base, {base}, {},
+ /*flags=*/0, /*child=*/nullptr,
+ &execve_errno));
+ EXPECT_EQ(execve_errno, ENOENT);
+}
+
+TEST(ExecveatTest, InvalidFlags) {
+ int execve_errno;
+ ASSERT_NO_ERRNO_AND_VALUE(ForkAndExecveat(
+ /*dirfd=*/-1, "", {}, {}, /*flags=*/0xFFFF, /*child=*/nullptr,
+ &execve_errno));
+ EXPECT_EQ(execve_errno, EINVAL);
+}
+
+// Priority consistent across calls to execve()
+TEST(GetpriorityTest, ExecveMaintainsPriority) {
+ int prio = 16;
+ ASSERT_THAT(setpriority(PRIO_PROCESS, getpid(), prio), SyscallSucceeds());
+
+ // To avoid trying to use negative exit values, check for
+ // 20 - prio. Since prio should always be in the range [-20, 19],
+ // this leave expected_exit_code in the range [1, 40].
+ int expected_exit_code = 20 - prio;
+
+ // Program run (priority_execve) will exit(X) where
+ // X=getpriority(PRIO_PROCESS,0). Check that this exit value is prio.
+ CheckExec(RunfilePath(kPriorityWorkload), {RunfilePath(kPriorityWorkload)},
+ {}, W_EXITCODE(expected_exit_code, 0), "");
+}
+
+void ExecWithThread() {
+ // Used to ensure that the thread has actually started.
+ absl::Mutex mu;
+ bool started = false;
+
+ ScopedThread t([&] {
+ mu.Lock();
+ started = true;
+ mu.Unlock();
+
+ while (true) {
+ pause();
+ }
+ });
+
+ mu.LockWhen(absl::Condition(&started));
+ mu.Unlock();
+
+ const ExecveArray argv = {"/proc/self/exe", kExit42};
+ const ExecveArray envv;
+
+ execve("/proc/self/exe", argv.get(), envv.get());
+ exit(errno);
+}
+
+void ExecFromThread() {
+ ScopedThread t([] {
+ const ExecveArray argv = {"/proc/self/exe", kExit42};
+ const ExecveArray envv;
+
+ execve("/proc/self/exe", argv.get(), envv.get());
+ exit(errno);
+ });
+
+ while (true) {
+ pause();
+ }
+}
+
+bool ValidateProcCmdlineVsArgv(const int argc, const char* const* argv) {
+ auto contents_or = GetContents("/proc/self/cmdline");
+ if (!contents_or.ok()) {
+ std::cerr << "Unable to get /proc/self/cmdline: " << contents_or.error()
+ << std::endl;
+ return false;
+ }
+ auto contents = contents_or.ValueOrDie();
+ if (contents.back() != '\0') {
+ std::cerr << "Non-null terminated /proc/self/cmdline!" << std::endl;
+ return false;
+ }
+ contents.pop_back();
+ std::vector<std::string> procfs_cmdline = absl::StrSplit(contents, '\0');
+
+ if (static_cast<int>(procfs_cmdline.size()) != argc) {
+ std::cerr << "argc = " << argc << " != " << procfs_cmdline.size()
+ << std::endl;
+ return false;
+ }
+
+ for (int i = 0; i < argc; ++i) {
+ if (procfs_cmdline[i] != argv[i]) {
+ std::cerr << "Procfs command line argument " << i << " mismatch "
+ << procfs_cmdline[i] << " != " << argv[i] << std::endl;
+ return false;
+ }
+ }
+ return true;
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ // Start by validating that the stack argv is consistent with procfs.
+ if (!gvisor::testing::ValidateProcCmdlineVsArgv(argc, argv)) {
+ return 1;
+ }
+
+ // Some of these tests require no background threads, so check for them before
+ // TestInit.
+ for (int i = 0; i < argc; i++) {
+ absl::string_view arg(argv[i]);
+
+ if (arg == gvisor::testing::kExit42) {
+ return 42;
+ }
+ if (arg == gvisor::testing::kExecWithThread) {
+ gvisor::testing::ExecWithThread();
+ return 1;
+ }
+ if (arg == gvisor::testing::kExecFromThread) {
+ gvisor::testing::ExecFromThread();
+ return 1;
+ }
+ }
+
+ gvisor::testing::TestInit(&argc, &argv);
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/exec.h b/test/syscalls/linux/exec.h
new file mode 100644
index 000000000..5c0f7e654
--- /dev/null
+++ b/test/syscalls/linux/exec.h
@@ -0,0 +1,34 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_EXEC_H_
+#define GVISOR_TEST_SYSCALLS_EXEC_H_
+
+#include <sys/wait.h>
+
+namespace gvisor {
+namespace testing {
+
+// Returns the exit code used by exec_basic_workload.
+inline int ArgEnvExitCode(int args, int envs) { return args + envs * 10; }
+
+// Returns the exit status used by exec_basic_workload.
+inline int ArgEnvExitStatus(int args, int envs) {
+ return W_EXITCODE(ArgEnvExitCode(args, envs), 0);
+}
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_EXEC_H_
diff --git a/test/syscalls/linux/exec_assert_closed_workload.cc b/test/syscalls/linux/exec_assert_closed_workload.cc
new file mode 100644
index 000000000..95643618d
--- /dev/null
+++ b/test/syscalls/linux/exec_assert_closed_workload.cc
@@ -0,0 +1,45 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <iostream>
+
+#include "absl/strings/numbers.h"
+
+int main(int argc, char** argv) {
+ if (argc != 2) {
+ std::cerr << "need two arguments, got " << argc;
+ exit(1);
+ }
+ int fd;
+ if (!absl::SimpleAtoi(argv[1], &fd)) {
+ std::cerr << "fd: " << argv[1] << " could not be parsed" << std::endl;
+ exit(1);
+ }
+ struct stat s;
+ if (fstat(fd, &s) == 0) {
+ std::cerr << "fd: " << argv[1] << " should not be valid" << std::endl;
+ exit(2);
+ }
+ if (errno != EBADF) {
+ std::cerr << "fstat fd: " << argv[1] << " got errno: " << errno
+ << " wanted: " << EBADF << std::endl;
+ exit(1);
+ }
+ return 0;
+}
diff --git a/test/syscalls/linux/exec_basic_workload.cc b/test/syscalls/linux/exec_basic_workload.cc
new file mode 100644
index 000000000..1bbd6437e
--- /dev/null
+++ b/test/syscalls/linux/exec_basic_workload.cc
@@ -0,0 +1,31 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+
+#include <iostream>
+
+#include "test/syscalls/linux/exec.h"
+
+int main(int argc, char** argv, char** envp) {
+ int i;
+ for (i = 0; i < argc; i++) {
+ std::cerr << argv[i] << std::endl;
+ }
+ for (i = 0; envp[i] != nullptr; i++) {
+ std::cerr << envp[i] << std::endl;
+ }
+ exit(gvisor::testing::ArgEnvExitCode(argc - 1, i));
+ return 0;
+}
diff --git a/test/syscalls/linux/exec_binary.cc b/test/syscalls/linux/exec_binary.cc
new file mode 100644
index 000000000..18d2f22c1
--- /dev/null
+++ b/test/syscalls/linux/exec_binary.cc
@@ -0,0 +1,1646 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <elf.h>
+#include <errno.h>
+#include <signal.h>
+#include <sys/ptrace.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/user.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <functional>
+#include <iterator>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/proc_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+using ::testing::AnyOf;
+using ::testing::Eq;
+
+#if !defined(__x86_64__) && !defined(__aarch64__)
+// The assembly stub and ELF internal details must be ported to other arches.
+#error "Test only supported on x86-64/arm64"
+#endif // __x86_64__ || __aarch64__
+
+#if defined(__x86_64__)
+#define EM_TYPE EM_X86_64
+#define IP_REG(p) ((p).rip)
+#define RAX_REG(p) ((p).rax)
+#define RDI_REG(p) ((p).rdi)
+#define RETURN_REG(p) ((p).rax)
+
+// amd64 stub that calls PTRACE_TRACEME and sends itself SIGSTOP.
+const char kPtraceCode[] = {
+ // movq $101, %rax /* ptrace */
+ '\x48',
+ '\xc7',
+ '\xc0',
+ '\x65',
+ '\x00',
+ '\x00',
+ '\x00',
+ // movq $0, %rsi /* PTRACE_TRACEME */
+ '\x48',
+ '\xc7',
+ '\xc6',
+ '\x00',
+ '\x00',
+ '\x00',
+ '\x00',
+ // movq $0, %rdi
+ '\x48',
+ '\xc7',
+ '\xc7',
+ '\x00',
+ '\x00',
+ '\x00',
+ '\x00',
+ // movq $0, %rdx
+ '\x48',
+ '\xc7',
+ '\xc2',
+ '\x00',
+ '\x00',
+ '\x00',
+ '\x00',
+ // movq $0, %r10
+ '\x49',
+ '\xc7',
+ '\xc2',
+ '\x00',
+ '\x00',
+ '\x00',
+ '\x00',
+ // syscall
+ '\x0f',
+ '\x05',
+
+ // movq $39, %rax /* getpid */
+ '\x48',
+ '\xc7',
+ '\xc0',
+ '\x27',
+ '\x00',
+ '\x00',
+ '\x00',
+ // syscall
+ '\x0f',
+ '\x05',
+
+ // movq %rax, %rdi /* pid */
+ '\x48',
+ '\x89',
+ '\xc7',
+ // movq $62, %rax /* kill */
+ '\x48',
+ '\xc7',
+ '\xc0',
+ '\x3e',
+ '\x00',
+ '\x00',
+ '\x00',
+ // movq $19, %rsi /* SIGSTOP */
+ '\x48',
+ '\xc7',
+ '\xc6',
+ '\x13',
+ '\x00',
+ '\x00',
+ '\x00',
+ // syscall
+ '\x0f',
+ '\x05',
+};
+
+// Size of a syscall instruction.
+constexpr int kSyscallSize = 2;
+
+#elif defined(__aarch64__)
+#define EM_TYPE EM_AARCH64
+#define IP_REG(p) ((p).pc)
+#define RAX_REG(p) ((p).regs[8])
+#define RDI_REG(p) ((p).regs[0])
+#define RETURN_REG(p) ((p).regs[0])
+
+const char kPtraceCode[] = {
+ // MOVD $117, R8 /* ptrace */
+ '\xa8',
+ '\x0e',
+ '\x80',
+ '\xd2',
+ // MOVD $0, R0 /* PTRACE_TRACEME */
+ '\x00',
+ '\x00',
+ '\x80',
+ '\xd2',
+ // MOVD $0, R1 /* pid */
+ '\x01',
+ '\x00',
+ '\x80',
+ '\xd2',
+ // MOVD $0, R2 /* addr */
+ '\x02',
+ '\x00',
+ '\x80',
+ '\xd2',
+ // MOVD $0, R3 /* data */
+ '\x03',
+ '\x00',
+ '\x80',
+ '\xd2',
+ // SVC
+ '\x01',
+ '\x00',
+ '\x00',
+ '\xd4',
+ // MOVD $172, R8 /* getpid */
+ '\x88',
+ '\x15',
+ '\x80',
+ '\xd2',
+ // SVC
+ '\x01',
+ '\x00',
+ '\x00',
+ '\xd4',
+ // MOVD $129, R8 /* kill, R0=pid */
+ '\x28',
+ '\x10',
+ '\x80',
+ '\xd2',
+ // MOVD $19, R1 /* SIGSTOP */
+ '\x61',
+ '\x02',
+ '\x80',
+ '\xd2',
+ // SVC
+ '\x01',
+ '\x00',
+ '\x00',
+ '\xd4',
+};
+// Size of a syscall instruction.
+constexpr int kSyscallSize = 4;
+#else
+#error "Unknown architecture"
+#endif
+
+// This test suite tests executable loading in the kernel (ELF and interpreter
+// scripts).
+
+// Parameterized ELF types for 64 and 32 bit.
+template <int Size>
+struct ElfTypes;
+
+template <>
+struct ElfTypes<64> {
+ typedef Elf64_Ehdr ElfEhdr;
+ typedef Elf64_Phdr ElfPhdr;
+};
+
+template <>
+struct ElfTypes<32> {
+ typedef Elf32_Ehdr ElfEhdr;
+ typedef Elf32_Phdr ElfPhdr;
+};
+
+template <int Size>
+struct ElfBinary {
+ using ElfEhdr = typename ElfTypes<Size>::ElfEhdr;
+ using ElfPhdr = typename ElfTypes<Size>::ElfPhdr;
+
+ ElfEhdr header = {};
+ std::vector<ElfPhdr> phdrs;
+ std::vector<char> data;
+
+ // UpdateOffsets updates p_offset, p_vaddr in all phdrs to account for the
+ // space taken by the header and phdrs.
+ //
+ // It also updates header.e_phnum and adds the offset to header.e_entry to
+ // account for the headers residing in the first PT_LOAD segment.
+ //
+ // Before calling UpdateOffsets each of those fields should be the appropriate
+ // offset into data.
+ void UpdateOffsets() {
+ size_t offset = sizeof(header) + phdrs.size() * sizeof(ElfPhdr);
+ header.e_entry += offset;
+ header.e_phnum = phdrs.size();
+ for (auto& p : phdrs) {
+ p.p_offset += offset;
+ p.p_vaddr += offset;
+ }
+ }
+
+ // AddInterpreter adds a PT_INTERP segment with the passed contents.
+ //
+ // A later call to UpdateOffsets is required to make the new phdr valid.
+ void AddInterpreter(std::vector<char> contents) {
+ const int start = data.size();
+ data.insert(data.end(), contents.begin(), contents.end());
+ const int size = data.size() - start;
+
+ ElfPhdr phdr = {};
+ phdr.p_type = PT_INTERP;
+ phdr.p_offset = start;
+ phdr.p_filesz = size;
+ phdr.p_memsz = size;
+ // "If [PT_INTERP] is present, it must precede any loadable segment entry."
+ phdrs.insert(phdrs.begin(), phdr);
+ }
+
+ // Writes the header, phdrs, and data to fd.
+ PosixError Write(int fd) const {
+ int ret = WriteFd(fd, &header, sizeof(header));
+ if (ret < 0) {
+ return PosixError(errno, "failed to write header");
+ } else if (ret != sizeof(header)) {
+ return PosixError(EIO, absl::StrCat("short write of header: ", ret));
+ }
+
+ for (auto const& p : phdrs) {
+ ret = WriteFd(fd, &p, sizeof(p));
+ if (ret < 0) {
+ return PosixError(errno, "failed to write phdr");
+ } else if (ret != sizeof(p)) {
+ return PosixError(EIO, absl::StrCat("short write of phdr: ", ret));
+ }
+ }
+
+ ret = WriteFd(fd, data.data(), data.size());
+ if (ret < 0) {
+ return PosixError(errno, "failed to write data");
+ } else if (ret != static_cast<int>(data.size())) {
+ return PosixError(EIO, absl::StrCat("short write of data: ", ret));
+ }
+
+ return NoError();
+ }
+};
+
+// Creates a new temporary executable ELF file in parent with elf as the
+// contents.
+template <int Size>
+PosixErrorOr<TempPath> CreateElfWith(absl::string_view parent,
+ ElfBinary<Size> const& elf) {
+ ASSIGN_OR_RETURN_ERRNO(
+ auto file, TempPath::CreateFileWith(parent, absl::string_view(), 0755));
+ ASSIGN_OR_RETURN_ERRNO(auto fd, Open(file.path(), O_RDWR));
+ RETURN_IF_ERRNO(elf.Write(fd.get()));
+ return std::move(file);
+}
+
+// Creates a new temporary executable ELF file with elf as the contents.
+template <int Size>
+PosixErrorOr<TempPath> CreateElfWith(ElfBinary<Size> const& elf) {
+ return CreateElfWith(GetAbsoluteTestTmpdir(), elf);
+}
+
+// Wait for pid to stop, and assert that it stopped via SIGSTOP.
+PosixError WaitStopped(pid_t pid) {
+ int status;
+ int ret = RetryEINTR(waitpid)(pid, &status, 0);
+ MaybeSave();
+ if (ret < 0) {
+ return PosixError(errno, "wait failed");
+ } else if (ret != pid) {
+ return PosixError(ESRCH, absl::StrCat("wait got ", ret, " want ", pid));
+ }
+
+ if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) {
+ return PosixError(EINVAL,
+ absl::StrCat("pid did not SIGSTOP; status = ", status));
+ }
+
+ return NoError();
+}
+
+// Returns a valid ELF that PTRACE_TRACEME and SIGSTOPs itself.
+//
+// UpdateOffsets must be called before writing this ELF.
+ElfBinary<64> StandardElf() {
+ ElfBinary<64> elf;
+ elf.header.e_ident[EI_MAG0] = ELFMAG0;
+ elf.header.e_ident[EI_MAG1] = ELFMAG1;
+ elf.header.e_ident[EI_MAG2] = ELFMAG2;
+ elf.header.e_ident[EI_MAG3] = ELFMAG3;
+ elf.header.e_ident[EI_CLASS] = ELFCLASS64;
+ elf.header.e_ident[EI_DATA] = ELFDATA2LSB;
+ elf.header.e_ident[EI_VERSION] = EV_CURRENT;
+ elf.header.e_type = ET_EXEC;
+ elf.header.e_machine = EM_TYPE;
+ elf.header.e_version = EV_CURRENT;
+ elf.header.e_phoff = sizeof(elf.header);
+ elf.header.e_phentsize = sizeof(decltype(elf)::ElfPhdr);
+
+ // TODO(gvisor.dev/issue/153): Always include a PT_GNU_STACK segment to
+ // disable executable stacks. With this omitted the stack (and all PROT_READ)
+ // mappings should be executable, but gVisor doesn't support that.
+ decltype(elf)::ElfPhdr phdr = {};
+ phdr.p_type = PT_GNU_STACK;
+ phdr.p_flags = PF_R | PF_W;
+ elf.phdrs.push_back(phdr);
+
+ phdr = {};
+ phdr.p_type = PT_LOAD;
+ phdr.p_flags = PF_R | PF_X;
+ phdr.p_offset = 0;
+ phdr.p_vaddr = 0x40000;
+ phdr.p_filesz = sizeof(kPtraceCode);
+ phdr.p_memsz = phdr.p_filesz;
+ elf.phdrs.push_back(phdr);
+
+ elf.header.e_entry = phdr.p_vaddr;
+
+ elf.data.assign(kPtraceCode, kPtraceCode + sizeof(kPtraceCode));
+
+ return elf;
+}
+
+// Test that a trivial binary executes.
+TEST(ElfTest, Execute) {
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ // Ensure it made it to SIGSTOP.
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ struct user_regs_struct regs;
+ struct iovec iov;
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+ EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+ // Read exactly the full register set.
+ EXPECT_EQ(iov.iov_len, sizeof(regs));
+ // RIP/PC is just beyond the final syscall instruction.
+ EXPECT_EQ(IP_REG(regs), elf.header.e_entry + sizeof(kPtraceCode));
+
+ EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
+ {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0,
+ file.path().c_str()},
+ })));
+}
+
+// StandardElf without data completes execve, but faults once running.
+TEST(ElfTest, MissingText) {
+ ElfBinary<64> elf = StandardElf();
+ elf.data.clear();
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0),
+ SyscallSucceedsWithValue(child));
+ // It runs off the end of the zeroes filling the end of the page.
+#if defined(__x86_64__)
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV) << status;
+#elif defined(__aarch64__)
+ // 0 is an invalid instruction opcode on arm64.
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGILL) << status;
+#endif
+}
+
+// Typical ELF with a data + bss segment
+TEST(ElfTest, DataSegment) {
+ ElfBinary<64> elf = StandardElf();
+
+ // Create a standard ELF, but extend to 1.5 pages. The second page will be the
+ // beginning of a multi-page data + bss segment.
+ elf.data.resize(kPageSize + kPageSize / 2);
+
+ decltype(elf)::ElfPhdr phdr = {};
+ phdr.p_type = PT_LOAD;
+ phdr.p_flags = PF_R | PF_W;
+ phdr.p_offset = kPageSize;
+ phdr.p_vaddr = 0x41000;
+ phdr.p_filesz = kPageSize / 2;
+ // The header is going to push vaddr up by a few hundred bytes. Keep p_memsz a
+ // bit less than 2 pages so this mapping doesn't extend beyond 0x43000.
+ phdr.p_memsz = 2 * kPageSize - kPageSize / 2;
+ elf.phdrs.push_back(phdr);
+
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ EXPECT_THAT(
+ child, ContainsMappings(std::vector<ProcMapsEntry>({
+ // text page.
+ {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0,
+ file.path().c_str()},
+ // data + bss page from file.
+ {0x41000, 0x42000, true, true, false, true, kPageSize, 0, 0, 0,
+ file.path().c_str()},
+ // bss page from anon.
+ {0x42000, 0x43000, true, true, false, true, 0, 0, 0, 0, ""},
+ })));
+}
+
+// Additonal pages beyond filesz honor (only) execute protections.
+//
+// N.B. Linux changed this in 4.11 (16e72e9b30986 "powerpc: do not make the
+// entire heap executable"). Previously, extra pages were always RW.
+TEST(ElfTest, ExtraMemPages) {
+ // gVisor has the newer behavior.
+ if (!IsRunningOnGvisor()) {
+ auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion());
+ SKIP_IF(version.major < 4 || (version.major == 4 && version.minor < 11));
+ }
+
+ ElfBinary<64> elf = StandardElf();
+
+ // Create a standard ELF, but extend to 1.5 pages. The second page will be the
+ // beginning of a multi-page data + bss segment.
+ elf.data.resize(kPageSize + kPageSize / 2);
+
+ decltype(elf)::ElfPhdr phdr = {};
+ phdr.p_type = PT_LOAD;
+ // RWX segment. The extra anon page will also be RWX.
+ //
+ // N.B. Linux uses clear_user to clear the end of the file-mapped page, which
+ // respects the mapping protections. Thus if we map this RO with memsz >
+ // (unaligned) filesz, then execve will fail with EFAULT. See padzero(elf_bss)
+ // in fs/binfmt_elf.c:load_elf_binary.
+ //
+ // N.N.B.B. The above only applies to the last segment. For earlier segments,
+ // the clear_user error is ignored.
+ phdr.p_flags = PF_R | PF_W | PF_X;
+ phdr.p_offset = kPageSize;
+ phdr.p_vaddr = 0x41000;
+ phdr.p_filesz = kPageSize / 2;
+ // The header is going to push vaddr up by a few hundred bytes. Keep p_memsz a
+ // bit less than 2 pages so this mapping doesn't extend beyond 0x43000.
+ phdr.p_memsz = 2 * kPageSize - kPageSize / 2;
+ elf.phdrs.push_back(phdr);
+
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ EXPECT_THAT(child,
+ ContainsMappings(std::vector<ProcMapsEntry>({
+ // text page.
+ {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0,
+ file.path().c_str()},
+ // data + bss page from file.
+ {0x41000, 0x42000, true, true, true, true, kPageSize, 0, 0, 0,
+ file.path().c_str()},
+ // extra page from anon.
+ {0x42000, 0x43000, true, true, true, true, 0, 0, 0, 0, ""},
+ })));
+}
+
+// An aligned segment with filesz == 0, memsz > 0 is anon-only.
+TEST(ElfTest, AnonOnlySegment) {
+ ElfBinary<64> elf = StandardElf();
+
+ decltype(elf)::ElfPhdr phdr = {};
+ phdr.p_type = PT_LOAD;
+ // RO segment. The extra anon page will be RW anyways.
+ phdr.p_flags = PF_R;
+ phdr.p_offset = 0;
+ phdr.p_vaddr = 0x41000;
+ phdr.p_filesz = 0;
+ phdr.p_memsz = kPageSize;
+ elf.phdrs.push_back(phdr);
+
+ elf.UpdateOffsets();
+
+ // UpdateOffsets adjusts p_vaddr and p_offset by the header size, but we need
+ // a page-aligned p_vaddr to get a truly anon-only page.
+ elf.phdrs[2].p_vaddr = 0x41000;
+ // N.B. p_offset is now unaligned, but Linux doesn't care since this is
+ // anon-only.
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ EXPECT_THAT(child,
+ ContainsMappings(std::vector<ProcMapsEntry>({
+ // text page.
+ {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0,
+ file.path().c_str()},
+ // anon page.
+ {0x41000, 0x42000, true, true, false, true, 0, 0, 0, 0, ""},
+ })));
+}
+
+// p_offset must have the same alignment as p_vaddr.
+TEST(ElfTest, UnalignedOffset) {
+ ElfBinary<64> elf = StandardElf();
+
+ // Unaligned offset.
+ elf.phdrs[1].p_offset += 1;
+
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+
+ // execve(2) return EINVAL, but behavior varies between Linux and gVisor.
+ //
+ // On Linux, the new mm is committed before attempting to map into it. By the
+ // time we hit EINVAL in the segment mmap, the old mm is gone. Linux returns
+ // to an empty mm, which immediately segfaults.
+ //
+ // OTOH, gVisor maps into the new mm before committing it. Thus when it hits
+ // failure, the caller is still intact to receive the error.
+ if (IsRunningOnGvisor()) {
+ ASSERT_EQ(execve_errno, EINVAL);
+ } else {
+ ASSERT_EQ(execve_errno, 0);
+
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0),
+ SyscallSucceedsWithValue(child));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV) << status;
+ }
+}
+
+// Linux will allow PT_LOAD segments to overlap.
+TEST(ElfTest, DirectlyOverlappingSegments) {
+ // NOTE(b/37289926): see PIEOutOfOrderSegments.
+ SKIP_IF(IsRunningOnGvisor());
+
+ ElfBinary<64> elf = StandardElf();
+
+ // Same as the StandardElf mapping.
+ decltype(elf)::ElfPhdr phdr = {};
+ phdr.p_type = PT_LOAD;
+ // Add PF_W so we can differentiate this mapping from the first.
+ phdr.p_flags = PF_R | PF_W | PF_X;
+ phdr.p_offset = 0;
+ phdr.p_vaddr = 0x40000;
+ phdr.p_filesz = sizeof(kPtraceCode);
+ phdr.p_memsz = phdr.p_filesz;
+ elf.phdrs.push_back(phdr);
+
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
+ {0x40000, 0x41000, true, true, true, true, 0, 0, 0, 0,
+ file.path().c_str()},
+ })));
+}
+
+// Linux allows out-of-order PT_LOAD segments.
+TEST(ElfTest, OutOfOrderSegments) {
+ // NOTE(b/37289926): see PIEOutOfOrderSegments.
+ SKIP_IF(IsRunningOnGvisor());
+
+ ElfBinary<64> elf = StandardElf();
+
+ decltype(elf)::ElfPhdr phdr = {};
+ phdr.p_type = PT_LOAD;
+ phdr.p_flags = PF_R | PF_X;
+ phdr.p_offset = 0;
+ phdr.p_vaddr = 0x20000;
+ phdr.p_filesz = sizeof(kPtraceCode);
+ phdr.p_memsz = phdr.p_filesz;
+ elf.phdrs.push_back(phdr);
+
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
+ {0x20000, 0x21000, true, false, true, true, 0, 0, 0, 0,
+ file.path().c_str()},
+ {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0,
+ file.path().c_str()},
+ })));
+}
+
+// header.e_phoff is bound the end of the file.
+TEST(ElfTest, OutOfBoundsPhdrs) {
+ ElfBinary<64> elf = StandardElf();
+ elf.header.e_phoff = 0x100000;
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ // On Linux 3.11, this caused EIO. On newer Linux, it causes ENOEXEC.
+ EXPECT_THAT(execve_errno, AnyOf(Eq(ENOEXEC), Eq(EIO)));
+}
+
+// Claim there is a phdr beyond the end of the file, but don't include it.
+TEST(ElfTest, MissingPhdr) {
+ ElfBinary<64> elf = StandardElf();
+
+ // Clear data so the file ends immediately after the phdrs.
+ // N.B. Per ElfTest.MissingData, StandardElf without data completes execve
+ // without error.
+ elf.data.clear();
+ elf.UpdateOffsets();
+
+ // Claim that there is another phdr just beyond the end of the file. Of
+ // course, it isn't accessible.
+ elf.header.e_phnum++;
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ // On Linux 3.11, this caused EIO. On newer Linux, it causes ENOEXEC.
+ EXPECT_THAT(execve_errno, AnyOf(Eq(ENOEXEC), Eq(EIO)));
+}
+
+// No headers at all, just the ELF magic.
+TEST(ElfTest, MissingHeader) {
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0755));
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+ const char kElfMagic[] = {0x7f, 'E', 'L', 'F'};
+
+ ASSERT_THAT(WriteFd(fd.get(), &kElfMagic, sizeof(kElfMagic)),
+ SyscallSucceeds());
+ fd.reset();
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ EXPECT_EQ(execve_errno, ENOEXEC);
+}
+
+// Load a PIE ELF with a data + bss segment.
+TEST(ElfTest, PIE) {
+ ElfBinary<64> elf = StandardElf();
+
+ elf.header.e_type = ET_DYN;
+
+ // Create a standard ELF, but extend to 1.5 pages. The second page will be the
+ // beginning of a multi-page data + bss segment.
+ elf.data.resize(kPageSize + kPageSize / 2);
+
+ elf.header.e_entry = 0x0;
+
+ decltype(elf)::ElfPhdr phdr = {};
+ phdr.p_type = PT_LOAD;
+ phdr.p_flags = PF_R | PF_W;
+ phdr.p_offset = kPageSize;
+ // Put the data segment at a bit of an offset.
+ phdr.p_vaddr = 0x20000;
+ phdr.p_filesz = kPageSize / 2;
+ // The header is going to push vaddr up by a few hundred bytes. Keep p_memsz a
+ // bit less than 2 pages so this mapping doesn't extend beyond 0x43000.
+ phdr.p_memsz = 2 * kPageSize - kPageSize / 2;
+ elf.phdrs.push_back(phdr);
+
+ elf.UpdateOffsets();
+
+ // The first segment really needs to start at 0 for a normal PIE binary, and
+ // thus includes the headers.
+ const uint64_t offset = elf.phdrs[1].p_offset;
+ elf.phdrs[1].p_offset = 0x0;
+ elf.phdrs[1].p_vaddr = 0x0;
+ elf.phdrs[1].p_filesz += offset;
+ elf.phdrs[1].p_memsz += offset;
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ // RIP tells us which page the first segment was loaded into.
+ struct user_regs_struct regs;
+ struct iovec iov;
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+
+ EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+ // Read exactly the full register set.
+ EXPECT_EQ(iov.iov_len, sizeof(regs));
+
+ const uint64_t load_addr = IP_REG(regs) & ~(kPageSize - 1);
+
+ EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
+ // text page.
+ {load_addr, load_addr + 0x1000, true, false, true,
+ true, 0, 0, 0, 0, file.path().c_str()},
+ // data + bss page from file.
+ {load_addr + 0x20000, load_addr + 0x21000, true, true,
+ false, true, kPageSize, 0, 0, 0, file.path().c_str()},
+ // bss page from anon.
+ {load_addr + 0x21000, load_addr + 0x22000, true, true,
+ false, true, 0, 0, 0, 0, ""},
+ })));
+}
+
+// PIE binary with a non-zero start address.
+//
+// This is non-standard for a PIE binary, but valid. The binary is still loaded
+// at an arbitrary address, not the first PT_LOAD vaddr.
+//
+// N.B. Linux changed this behavior in d1fd836dcf00d2028c700c7e44d2c23404062c90.
+// Previously, with "randomization" enabled, PIE binaries with a non-zero start
+// address would be be loaded at the address they specified because mmap was
+// passed the load address, which wasn't 0 as expected.
+//
+// This change is present in kernel v4.1+.
+TEST(ElfTest, PIENonZeroStart) {
+ // gVisor has the newer behavior.
+ if (!IsRunningOnGvisor()) {
+ auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion());
+ SKIP_IF(version.major < 4 || (version.major == 4 && version.minor < 1));
+ }
+
+ ElfBinary<64> elf = StandardElf();
+
+ elf.header.e_type = ET_DYN;
+
+ // Create a standard ELF, but extend to 1.5 pages. The second page will be the
+ // beginning of a multi-page data + bss segment.
+ elf.data.resize(kPageSize + kPageSize / 2);
+
+ decltype(elf)::ElfPhdr phdr = {};
+ phdr.p_type = PT_LOAD;
+ phdr.p_flags = PF_R | PF_W;
+ phdr.p_offset = kPageSize;
+ // Put the data segment at a bit of an offset.
+ phdr.p_vaddr = 0x60000;
+ phdr.p_filesz = kPageSize / 2;
+ // The header is going to push vaddr up by a few hundred bytes. Keep p_memsz a
+ // bit less than 2 pages so this mapping doesn't extend beyond 0x43000.
+ phdr.p_memsz = 2 * kPageSize - kPageSize / 2;
+ elf.phdrs.push_back(phdr);
+
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ // RIP tells us which page the first segment was loaded into.
+ struct user_regs_struct regs;
+ struct iovec iov;
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+ EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+ // Read exactly the full register set.
+ EXPECT_EQ(iov.iov_len, sizeof(regs));
+
+ const uint64_t load_addr = IP_REG(regs) & ~(kPageSize - 1);
+
+ // The ELF is loaded at an arbitrary address, not the first PT_LOAD vaddr.
+ //
+ // N.B. this is technically flaky, but Linux is *extremely* unlikely to pick
+ // this as the start address, as it searches from the top down.
+ EXPECT_NE(load_addr, 0x40000);
+
+ EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
+ // text page.
+ {load_addr, load_addr + 0x1000, true, false, true,
+ true, 0, 0, 0, 0, file.path().c_str()},
+ // data + bss page from file.
+ {load_addr + 0x20000, load_addr + 0x21000, true, true,
+ false, true, kPageSize, 0, 0, 0, file.path().c_str()},
+ // bss page from anon.
+ {load_addr + 0x21000, load_addr + 0x22000, true, true,
+ false, true, 0, 0, 0, 0, ""},
+ })));
+}
+
+TEST(ElfTest, PIEOutOfOrderSegments) {
+ // TODO(b/37289926): This triggers a bug in Linux where it computes the size
+ // of the binary as 0x20000 - 0x40000 = 0xfffffffffffe0000, which obviously
+ // fails to map.
+ //
+ // We test gVisor's behavior (of rejecting the binary) because I assert that
+ // Linux is wrong and needs to be fixed.
+ SKIP_IF(!IsRunningOnGvisor());
+
+ ElfBinary<64> elf = StandardElf();
+
+ elf.header.e_type = ET_DYN;
+
+ // Create a standard ELF, but extend to 1.5 pages. The second page will be the
+ // beginning of a multi-page data + bss segment.
+ elf.data.resize(kPageSize + kPageSize / 2);
+
+ decltype(elf)::ElfPhdr phdr = {};
+ phdr.p_type = PT_LOAD;
+ phdr.p_flags = PF_R | PF_W;
+ phdr.p_offset = kPageSize;
+ // Put the data segment *before* the first segment.
+ phdr.p_vaddr = 0x20000;
+ phdr.p_filesz = kPageSize / 2;
+ // The header is going to push vaddr up by a few hundred bytes. Keep p_memsz a
+ // bit less than 2 pages so this mapping doesn't extend beyond 0x43000.
+ phdr.p_memsz = 2 * kPageSize - kPageSize / 2;
+ elf.phdrs.push_back(phdr);
+
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ EXPECT_EQ(execve_errno, ENOEXEC);
+}
+
+// Standard dynamically linked binary with an ELF interpreter.
+TEST(ElfTest, ELFInterpreter) {
+ ElfBinary<64> interpreter = StandardElf();
+ interpreter.header.e_type = ET_DYN;
+ interpreter.header.e_entry = 0x0;
+ interpreter.UpdateOffsets();
+
+ // The first segment really needs to start at 0 for a normal PIE binary, and
+ // thus includes the headers.
+ uint64_t const offset = interpreter.phdrs[1].p_offset;
+ // N.B. Since Linux 4.10 (0036d1f7eb95b "binfmt_elf: fix calculations for bss
+ // padding"), Linux unconditionally zeroes the remainder of the highest mapped
+ // page in an interpreter, failing if the protections don't allow write. Thus
+ // we must mark this writeable.
+ interpreter.phdrs[1].p_flags = PF_R | PF_W | PF_X;
+ interpreter.phdrs[1].p_offset = 0x0;
+ interpreter.phdrs[1].p_vaddr = 0x0;
+ interpreter.phdrs[1].p_filesz += offset;
+ interpreter.phdrs[1].p_memsz += offset;
+
+ TempPath interpreter_file =
+ ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(interpreter));
+
+ ElfBinary<64> binary = StandardElf();
+
+ // Append the interpreter path.
+ int const interp_data_start = binary.data.size();
+ for (char const c : interpreter_file.path()) {
+ binary.data.push_back(c);
+ }
+ // NUL-terminate.
+ binary.data.push_back(0);
+ int const interp_data_size = binary.data.size() - interp_data_start;
+
+ decltype(binary)::ElfPhdr phdr = {};
+ phdr.p_type = PT_INTERP;
+ phdr.p_offset = interp_data_start;
+ phdr.p_filesz = interp_data_size;
+ phdr.p_memsz = interp_data_size;
+ // "If [PT_INTERP] is present, it must precede any loadable segment entry."
+ //
+ // However, Linux allows it anywhere, so we just stick it at the end to make
+ // sure out-of-order PT_INTERP is OK.
+ binary.phdrs.push_back(phdr);
+
+ binary.UpdateOffsets();
+
+ TempPath binary_file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(binary));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec(
+ binary_file.path(), {binary_file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ // RIP tells us which page the first segment of the interpreter was loaded
+ // into.
+ struct user_regs_struct regs;
+ struct iovec iov;
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+ EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+ // Read exactly the full register set.
+ EXPECT_EQ(iov.iov_len, sizeof(regs));
+
+ const uint64_t interp_load_addr = IP_REG(regs) & ~(kPageSize - 1);
+
+ EXPECT_THAT(
+ child, ContainsMappings(std::vector<ProcMapsEntry>({
+ // Main binary
+ {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0,
+ binary_file.path().c_str()},
+ // Interpreter
+ {interp_load_addr, interp_load_addr + 0x1000, true, true, true,
+ true, 0, 0, 0, 0, interpreter_file.path().c_str()},
+ })));
+}
+
+// Test parameter to ElfInterpterStaticTest cases. The first item is a suffix to
+// add to the end of the interpreter path in the PT_INTERP segment and the
+// second is the expected execve(2) errno.
+using ElfInterpreterStaticParam = std::tuple<std::vector<char>, int>;
+
+class ElfInterpreterStaticTest
+ : public ::testing::TestWithParam<ElfInterpreterStaticParam> {};
+
+// Statically linked ELF with a statically linked ELF interpreter.
+TEST_P(ElfInterpreterStaticTest, Test) {
+ const std::vector<char> segment_suffix = std::get<0>(GetParam());
+ const int expected_errno = std::get<1>(GetParam());
+
+ ElfBinary<64> interpreter = StandardElf();
+ // See comment in ElfTest.ELFInterpreter.
+ interpreter.phdrs[1].p_flags = PF_R | PF_W | PF_X;
+ interpreter.UpdateOffsets();
+ TempPath interpreter_file =
+ ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(interpreter));
+
+ ElfBinary<64> binary = StandardElf();
+ // The PT_LOAD segment conflicts with the interpreter's PT_LOAD segment. The
+ // interpreter's will be mapped directly over the binary's.
+
+ // Interpreter path plus the parameterized suffix in the PT_INTERP segment.
+ const std::string path = interpreter_file.path();
+ std::vector<char> segment(path.begin(), path.end());
+ segment.insert(segment.end(), segment_suffix.begin(), segment_suffix.end());
+ binary.AddInterpreter(segment);
+
+ binary.UpdateOffsets();
+
+ TempPath binary_file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(binary));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec(
+ binary_file.path(), {binary_file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, expected_errno);
+
+ if (expected_errno == 0) {
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ EXPECT_THAT(child, ContainsMappings(std::vector<ProcMapsEntry>({
+ // Interpreter.
+ {0x40000, 0x41000, true, true, true, true, 0, 0, 0,
+ 0, interpreter_file.path().c_str()},
+ })));
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ Cases, ElfInterpreterStaticTest,
+ ::testing::ValuesIn({
+ // Simple NUL-terminator to run the interpreter as normal.
+ std::make_tuple(std::vector<char>({'\0'}), 0),
+ // Add some garbage to the segment followed by a NUL-terminator. This is
+ // ignored.
+ std::make_tuple(std::vector<char>({'\0', 'b', '\0'}), 0),
+ // Add some garbage to the segment without a NUL-terminator. Linux will
+ // reject
+ // this.
+ std::make_tuple(std::vector<char>({'\0', 'b'}), ENOEXEC),
+ }));
+
+// Test parameter to ElfInterpterBadPathTest cases. The first item is the
+// contents of the PT_INTERP segment and the second is the expected execve(2)
+// errno.
+using ElfInterpreterBadPathParam = std::tuple<std::vector<char>, int>;
+
+class ElfInterpreterBadPathTest
+ : public ::testing::TestWithParam<ElfInterpreterBadPathParam> {};
+
+TEST_P(ElfInterpreterBadPathTest, Test) {
+ const std::vector<char> segment = std::get<0>(GetParam());
+ const int expected_errno = std::get<1>(GetParam());
+
+ ElfBinary<64> binary = StandardElf();
+ binary.AddInterpreter(segment);
+ binary.UpdateOffsets();
+
+ TempPath binary_file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(binary));
+
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec(
+ binary_file.path(), {binary_file.path()}, {}, nullptr, &execve_errno));
+ EXPECT_EQ(execve_errno, expected_errno);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ Cases, ElfInterpreterBadPathTest,
+ ::testing::ValuesIn({
+ // NUL-terminated fake path in the PT_INTERP segment.
+ std::make_tuple(std::vector<char>({'/', 'f', '/', 'b', '\0'}), ENOENT),
+ // ELF interpreter not NUL-terminated.
+ std::make_tuple(std::vector<char>({'/', 'f', '/', 'b'}), ENOEXEC),
+ // ELF interpreter path omitted entirely.
+ //
+ // fs/binfmt_elf.c:load_elf_binary returns ENOEXEC if p_filesz is < 2
+ // bytes.
+ std::make_tuple(std::vector<char>({'\0'}), ENOEXEC),
+ // ELF interpreter path = "\0".
+ //
+ // fs/binfmt_elf.c:load_elf_binary returns ENOEXEC if p_filesz is < 2
+ // bytes, so add an extra byte to pass that check.
+ //
+ // load_elf_binary -> open_exec -> do_open_execat fails to check that
+ // name != '\0' before calling do_filp_open, which thus opens the
+ // working directory. do_open_execat returns EACCES because the
+ // directory is not a regular file.
+ std::make_tuple(std::vector<char>({'\0', '\0'}), EACCES),
+ }));
+
+// Relative path to ELF interpreter.
+TEST(ElfTest, ELFInterpreterRelative) {
+ ElfBinary<64> interpreter = StandardElf();
+ interpreter.header.e_type = ET_DYN;
+ interpreter.header.e_entry = 0x0;
+ interpreter.UpdateOffsets();
+
+ // The first segment really needs to start at 0 for a normal PIE binary, and
+ // thus includes the headers.
+ uint64_t const offset = interpreter.phdrs[1].p_offset;
+ // See comment in ElfTest.ELFInterpreter.
+ interpreter.phdrs[1].p_flags = PF_R | PF_W | PF_X;
+ interpreter.phdrs[1].p_offset = 0x0;
+ interpreter.phdrs[1].p_vaddr = 0x0;
+ interpreter.phdrs[1].p_filesz += offset;
+ interpreter.phdrs[1].p_memsz += offset;
+
+ TempPath interpreter_file =
+ ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(interpreter));
+ auto cwd = ASSERT_NO_ERRNO_AND_VALUE(GetCWD());
+ auto interpreter_relative =
+ ASSERT_NO_ERRNO_AND_VALUE(GetRelativePath(cwd, interpreter_file.path()));
+
+ ElfBinary<64> binary = StandardElf();
+
+ // NUL-terminated path in the PT_INTERP segment.
+ std::vector<char> segment(interpreter_relative.begin(),
+ interpreter_relative.end());
+ segment.push_back(0);
+ binary.AddInterpreter(segment);
+
+ binary.UpdateOffsets();
+
+ TempPath binary_file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(binary));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec(
+ binary_file.path(), {binary_file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ // RIP tells us which page the first segment of the interpreter was loaded
+ // into.
+ struct user_regs_struct regs;
+ struct iovec iov;
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+ EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+ // Read exactly the full register set.
+ EXPECT_EQ(iov.iov_len, sizeof(regs));
+
+ const uint64_t interp_load_addr = IP_REG(regs) & ~(kPageSize - 1);
+
+ EXPECT_THAT(
+ child, ContainsMappings(std::vector<ProcMapsEntry>({
+ // Main binary
+ {0x40000, 0x41000, true, false, true, true, 0, 0, 0, 0,
+ binary_file.path().c_str()},
+ // Interpreter
+ {interp_load_addr, interp_load_addr + 0x1000, true, true, true,
+ true, 0, 0, 0, 0, interpreter_file.path().c_str()},
+ })));
+}
+
+// ELF interpreter architecture doesn't match the binary.
+TEST(ElfTest, ELFInterpreterWrongArch) {
+ ElfBinary<64> interpreter = StandardElf();
+ interpreter.header.e_machine = EM_PPC64;
+ interpreter.header.e_type = ET_DYN;
+ interpreter.header.e_entry = 0x0;
+ interpreter.UpdateOffsets();
+
+ // The first segment really needs to start at 0 for a normal PIE binary, and
+ // thus includes the headers.
+ uint64_t const offset = interpreter.phdrs[1].p_offset;
+ // See comment in ElfTest.ELFInterpreter.
+ interpreter.phdrs[1].p_flags = PF_R | PF_W | PF_X;
+ interpreter.phdrs[1].p_offset = 0x0;
+ interpreter.phdrs[1].p_vaddr = 0x0;
+ interpreter.phdrs[1].p_filesz += offset;
+ interpreter.phdrs[1].p_memsz += offset;
+
+ TempPath interpreter_file =
+ ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(interpreter));
+
+ ElfBinary<64> binary = StandardElf();
+
+ // NUL-terminated path in the PT_INTERP segment.
+ const std::string path = interpreter_file.path();
+ std::vector<char> segment(path.begin(), path.end());
+ segment.push_back(0);
+ binary.AddInterpreter(segment);
+
+ binary.UpdateOffsets();
+
+ TempPath binary_file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(binary));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec(
+ binary_file.path(), {binary_file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, ELIBBAD);
+}
+
+// No execute permissions on the binary.
+TEST(ElfTest, NoExecute) {
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ ASSERT_THAT(chmod(file.path().c_str(), 0644), SyscallSucceeds());
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ EXPECT_EQ(execve_errno, EACCES);
+}
+
+// Execute, but no read permissions on the binary works just fine.
+TEST(ElfTest, NoRead) {
+ // TODO(gvisor.dev/issue/160): gVisor's backing filesystem may prevent the
+ // sentry from reading the executable.
+ SKIP_IF(IsRunningOnGvisor());
+
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ ASSERT_THAT(chmod(file.path().c_str(), 0111), SyscallSucceeds());
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ // TODO(gvisor.dev/issue/160): A task with a non-readable executable is marked
+ // non-dumpable, preventing access to proc files. gVisor does not implement
+ // this behavior.
+}
+
+// No execute permissions on the ELF interpreter.
+TEST(ElfTest, ElfInterpreterNoExecute) {
+ ElfBinary<64> interpreter = StandardElf();
+ interpreter.header.e_type = ET_DYN;
+ interpreter.header.e_entry = 0x0;
+ interpreter.UpdateOffsets();
+
+ // The first segment really needs to start at 0 for a normal PIE binary, and
+ // thus includes the headers.
+ uint64_t const offset = interpreter.phdrs[1].p_offset;
+ // See comment in ElfTest.ELFInterpreter.
+ interpreter.phdrs[1].p_flags = PF_R | PF_W | PF_X;
+ interpreter.phdrs[1].p_offset = 0x0;
+ interpreter.phdrs[1].p_vaddr = 0x0;
+ interpreter.phdrs[1].p_filesz += offset;
+ interpreter.phdrs[1].p_memsz += offset;
+
+ TempPath interpreter_file =
+ ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(interpreter));
+
+ ElfBinary<64> binary = StandardElf();
+
+ // NUL-terminated path in the PT_INTERP segment.
+ const std::string path = interpreter_file.path();
+ std::vector<char> segment(path.begin(), path.end());
+ segment.push_back(0);
+ binary.AddInterpreter(segment);
+
+ binary.UpdateOffsets();
+
+ TempPath binary_file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(binary));
+
+ ASSERT_THAT(chmod(interpreter_file.path().c_str(), 0644), SyscallSucceeds());
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(interpreter_file.path(), {interpreter_file.path()}, {},
+ &child, &execve_errno));
+ EXPECT_EQ(execve_errno, EACCES);
+}
+
+// Execute a basic interpreter script.
+TEST(InterpreterScriptTest, Execute) {
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+ // Use /tmp explicitly to ensure the path is short enough.
+ TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", binary.path()), 0755));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ EXPECT_NO_ERRNO(WaitStopped(child));
+}
+
+// Whitespace after #!.
+TEST(InterpreterScriptTest, Whitespace) {
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+ // Use /tmp explicitly to ensure the path is short enough.
+ TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#! \t \t", binary.path()), 0755));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ EXPECT_NO_ERRNO(WaitStopped(child));
+}
+
+// Interpreter script is missing execute permission.
+TEST(InterpreterScriptTest, InterpreterScriptNoExecute) {
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+ // Use /tmp explicitly to ensure the path is short enough.
+ TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", binary.path()), 0644));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, EACCES);
+}
+
+// Binary interpreter script refers to is missing execute permission.
+TEST(InterpreterScriptTest, BinaryNoExecute) {
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+ // Use /tmp explicitly to ensure the path is short enough.
+ TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf));
+
+ ASSERT_THAT(chmod(binary.path().c_str(), 0644), SyscallSucceeds());
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", binary.path()), 0755));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, EACCES);
+}
+
+// Linux will load interpreter scripts five levels deep, but no more.
+TEST(InterpreterScriptTest, MaxRecursion) {
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+ // Use /tmp explicitly to ensure the path is short enough.
+ TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf));
+
+ TempPath script1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ "/tmp", absl::StrCat("#!", binary.path()), 0755));
+ TempPath script2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ "/tmp", absl::StrCat("#!", script1.path()), 0755));
+ TempPath script3 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ "/tmp", absl::StrCat("#!", script2.path()), 0755));
+ TempPath script4 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ "/tmp", absl::StrCat("#!", script3.path()), 0755));
+ TempPath script5 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ "/tmp", absl::StrCat("#!", script4.path()), 0755));
+ TempPath script6 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ "/tmp", absl::StrCat("#!", script5.path()), 0755));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(script6.path(), {script6.path()}, {}, &child, &execve_errno));
+ // Too many levels of recursion.
+ EXPECT_EQ(execve_errno, ELOOP);
+
+ // The next level up is OK.
+ auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(script5.path(), {script5.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ EXPECT_NO_ERRNO(WaitStopped(child));
+}
+
+// Interpreter script with a relative path.
+TEST(InterpreterScriptTest, RelativePath) {
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+ TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf));
+
+ auto cwd = ASSERT_NO_ERRNO_AND_VALUE(GetCWD());
+ auto binary_relative =
+ ASSERT_NO_ERRNO_AND_VALUE(GetRelativePath(cwd, binary.path()));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", binary_relative), 0755));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ EXPECT_NO_ERRNO(WaitStopped(child));
+}
+
+// Interpreter script with .. in a path component.
+TEST(InterpreterScriptTest, UncleanPath) {
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+ // Use /tmp explicitly to ensure the path is short enough.
+ TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!/tmp/../", binary.path()),
+ 0755));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ EXPECT_NO_ERRNO(WaitStopped(child));
+}
+
+// Passed interpreter script is a symlink.
+TEST(InterpreterScriptTest, Symlink) {
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+ // Use /tmp explicitly to ensure the path is short enough.
+ TempPath binary = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith("/tmp", elf));
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", binary.path()), 0755));
+
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), script.path()));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(link.path(), {link.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ EXPECT_NO_ERRNO(WaitStopped(child));
+}
+
+// Interpreter script points to a symlink loop.
+TEST(InterpreterScriptTest, SymlinkLoop) {
+ std::string const link1 = NewTempAbsPathInDir("/tmp");
+ std::string const link2 = NewTempAbsPathInDir("/tmp");
+
+ ASSERT_THAT(symlink(link2.c_str(), link1.c_str()), SyscallSucceeds());
+ auto remove_link1 = Cleanup(
+ [&link1] { EXPECT_THAT(unlink(link1.c_str()), SyscallSucceeds()); });
+
+ ASSERT_THAT(symlink(link1.c_str(), link2.c_str()), SyscallSucceeds());
+ auto remove_link2 = Cleanup(
+ [&link2] { EXPECT_THAT(unlink(link2.c_str()), SyscallSucceeds()); });
+
+ TempPath script = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::StrCat("#!", link1), 0755));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(script.path(), {script.path()}, {}, &child, &execve_errno));
+ EXPECT_EQ(execve_errno, ELOOP);
+}
+
+// Binary is a symlink loop.
+TEST(ExecveTest, SymlinkLoop) {
+ std::string const link1 = NewTempAbsPathInDir("/tmp");
+ std::string const link2 = NewTempAbsPathInDir("/tmp");
+
+ ASSERT_THAT(symlink(link2.c_str(), link1.c_str()), SyscallSucceeds());
+ auto remove_link = Cleanup(
+ [&link1] { EXPECT_THAT(unlink(link1.c_str()), SyscallSucceeds()); });
+
+ ASSERT_THAT(symlink(link1.c_str(), link2.c_str()), SyscallSucceeds());
+ auto remove_link2 = Cleanup(
+ [&link2] { EXPECT_THAT(unlink(link2.c_str()), SyscallSucceeds()); });
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(link1, {link1}, {}, &child, &execve_errno));
+ EXPECT_EQ(execve_errno, ELOOP);
+}
+
+// Binary is a directory.
+TEST(ExecveTest, Directory) {
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec("/tmp", {"/tmp"}, {}, &child, &execve_errno));
+ EXPECT_EQ(execve_errno, EACCES);
+}
+
+// Pass a valid binary as a directory (extra / on the end).
+TEST(ExecveTest, BinaryAsDirectory) {
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ std::string const path = absl::StrCat(file.path(), "/");
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(path, {path}, {}, &child, &execve_errno));
+ EXPECT_EQ(execve_errno, ENOTDIR);
+}
+
+// The initial brk value is after the page at the end of the binary.
+TEST(ExecveTest, BrkAfterBinary) {
+ ElfBinary<64> elf = StandardElf();
+ elf.UpdateOffsets();
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(CreateElfWith(elf));
+
+ pid_t child;
+ int execve_errno;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {file.path()}, {}, &child, &execve_errno));
+ ASSERT_EQ(execve_errno, 0);
+
+ // Ensure it made it to SIGSTOP.
+ ASSERT_NO_ERRNO(WaitStopped(child));
+
+ struct user_regs_struct regs;
+ struct iovec iov;
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+ EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+ // Read exactly the full register set.
+ EXPECT_EQ(iov.iov_len, sizeof(regs));
+
+ // RIP is just beyond the final syscall instruction. Rewind to execute a brk
+ // syscall.
+ IP_REG(regs) -= kSyscallSize;
+ RAX_REG(regs) = __NR_brk;
+ RDI_REG(regs) = 0;
+ ASSERT_THAT(ptrace(PTRACE_SETREGSET, child, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+
+ // Resume the child, waiting for syscall entry.
+ ASSERT_THAT(ptrace(PTRACE_SYSCALL, child, 0, 0), SyscallSucceeds());
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0),
+ SyscallSucceedsWithValue(child));
+ ASSERT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
+ << "status = " << status;
+
+ // Execute the syscall.
+ ASSERT_THAT(ptrace(PTRACE_SYSCALL, child, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(RetryEINTR(waitpid)(child, &status, 0),
+ SyscallSucceedsWithValue(child));
+ ASSERT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
+ << "status = " << status;
+
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+ EXPECT_THAT(ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+ // Read exactly the full register set.
+ EXPECT_EQ(iov.iov_len, sizeof(regs));
+
+ // brk is after the text page.
+ //
+ // The kernel does brk randomization, so we can't be sure what the exact
+ // address will be, but it is always beyond the final page in the binary.
+ // i.e., it does not start immediately after memsz in the middle of a page.
+ // Userspace may expect to use that space.
+ EXPECT_GE(RETURN_REG(regs), 0x41000);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/exec_proc_exe_workload.cc b/test/syscalls/linux/exec_proc_exe_workload.cc
new file mode 100644
index 000000000..2989379b7
--- /dev/null
+++ b/test/syscalls/linux/exec_proc_exe_workload.cc
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <iostream>
+
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+
+int main(int argc, char** argv, char** envp) {
+ // This is annoying. Because remote build systems may put these binaries
+ // in a content-addressable-store, you may wind up with /proc/self/exe
+ // pointing to some random path (but with a sensible argv[0]).
+ //
+ // Therefore, this test simply checks that the /proc/self/exe
+ // is absolute and *doesn't* match argv[1].
+ std::string exe =
+ gvisor::testing::ProcessExePath(getpid()).ValueOrDie();
+ if (exe[0] != '/') {
+ std::cerr << "relative path: " << exe << std::endl;
+ exit(1);
+ }
+ if (exe.find(argv[1]) != std::string::npos) {
+ std::cerr << "matching path: " << exe << std::endl;
+ exit(1);
+ }
+
+ return 0;
+}
diff --git a/test/syscalls/linux/exec_state_workload.cc b/test/syscalls/linux/exec_state_workload.cc
new file mode 100644
index 000000000..028902b14
--- /dev/null
+++ b/test/syscalls/linux/exec_state_workload.cc
@@ -0,0 +1,202 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <sys/time.h>
+
+#include <iostream>
+#include <ostream>
+#include <string>
+
+#include "absl/strings/numbers.h"
+
+// Pretty-print a sigset_t.
+std::ostream& operator<<(std::ostream& out, const sigset_t& s) {
+ out << "{ ";
+
+ for (int i = 0; i < NSIG; i++) {
+ if (sigismember(&s, i)) {
+ out << i << " ";
+ }
+ }
+
+ out << "}";
+ return out;
+}
+
+// Verify that the signo handler is handler.
+int CheckSigHandler(uint32_t signo, uintptr_t handler) {
+ struct sigaction sa;
+ int ret = sigaction(signo, nullptr, &sa);
+ if (ret < 0) {
+ perror("sigaction");
+ return 1;
+ }
+
+ if (reinterpret_cast<void (*)(int)>(handler) != sa.sa_handler) {
+ std::cerr << "signo " << signo << " handler got: " << sa.sa_handler
+ << " expected: " << std::hex << handler;
+ return 1;
+ }
+ return 0;
+}
+
+// Verify that the signo is blocked.
+int CheckSigBlocked(uint32_t signo) {
+ sigset_t s;
+ int ret = sigprocmask(SIG_SETMASK, nullptr, &s);
+ if (ret < 0) {
+ perror("sigprocmask");
+ return 1;
+ }
+
+ if (!sigismember(&s, signo)) {
+ std::cerr << "signal " << signo << " not blocked in signal mask: " << s
+ << std::endl;
+ return 1;
+ }
+ return 0;
+}
+
+// Verify that the itimer is enabled.
+int CheckItimerEnabled(uint32_t timer) {
+ struct itimerval itv;
+ int ret = getitimer(timer, &itv);
+ if (ret < 0) {
+ perror("getitimer");
+ return 1;
+ }
+
+ if (!itv.it_value.tv_sec && !itv.it_value.tv_usec &&
+ !itv.it_interval.tv_sec && !itv.it_interval.tv_usec) {
+ std::cerr << "timer " << timer
+ << " not enabled. value sec: " << itv.it_value.tv_sec
+ << " usec: " << itv.it_value.tv_usec
+ << " interval sec: " << itv.it_interval.tv_sec
+ << " usec: " << itv.it_interval.tv_usec << std::endl;
+ return 1;
+ }
+ return 0;
+}
+
+int PrintExecFn() {
+ unsigned long execfn = getauxval(AT_EXECFN);
+ if (!execfn) {
+ std::cerr << "AT_EXECFN missing" << std::endl;
+ return 1;
+ }
+
+ std::cerr << reinterpret_cast<const char*>(execfn) << std::endl;
+ return 0;
+}
+
+int PrintExecName() {
+ const size_t name_length = 20;
+ char name[name_length] = {0};
+ if (prctl(PR_GET_NAME, name) < 0) {
+ std::cerr << "prctl(PR_GET_NAME) failed" << std::endl;
+ return 1;
+ }
+
+ std::cerr << name << std::endl;
+ return 0;
+}
+
+void usage(const std::string& prog) {
+ std::cerr << "usage:\n"
+ << "\t" << prog << " CheckSigHandler <signo> <handler addr (hex)>\n"
+ << "\t" << prog << " CheckSigBlocked <signo>\n"
+ << "\t" << prog << " CheckTimerDisabled <timer>\n"
+ << "\t" << prog << " PrintExecFn\n"
+ << "\t" << prog << " PrintExecName" << std::endl;
+}
+
+int main(int argc, char** argv) {
+ if (argc < 2) {
+ usage(argv[0]);
+ return 1;
+ }
+
+ std::string func(argv[1]);
+
+ if (func == "CheckSigHandler") {
+ if (argc != 4) {
+ usage(argv[0]);
+ return 1;
+ }
+
+ uint32_t signo;
+ if (!absl::SimpleAtoi(argv[2], &signo)) {
+ std::cerr << "invalid signo: " << argv[2] << std::endl;
+ return 1;
+ }
+
+ uintptr_t handler;
+ if (!absl::numbers_internal::safe_strtoi_base(argv[3], &handler, 16)) {
+ std::cerr << "invalid handler: " << std::hex << argv[3] << std::endl;
+ return 1;
+ }
+
+ return CheckSigHandler(signo, handler);
+ }
+
+ if (func == "CheckSigBlocked") {
+ if (argc != 3) {
+ usage(argv[0]);
+ return 1;
+ }
+
+ uint32_t signo;
+ if (!absl::SimpleAtoi(argv[2], &signo)) {
+ std::cerr << "invalid signo: " << argv[2] << std::endl;
+ return 1;
+ }
+
+ return CheckSigBlocked(signo);
+ }
+
+ if (func == "CheckItimerEnabled") {
+ if (argc != 3) {
+ usage(argv[0]);
+ return 1;
+ }
+
+ uint32_t timer;
+ if (!absl::SimpleAtoi(argv[2], &timer)) {
+ std::cerr << "invalid signo: " << argv[2] << std::endl;
+ return 1;
+ }
+
+ return CheckItimerEnabled(timer);
+ }
+
+ if (func == "PrintExecFn") {
+ // N.B. This will be called as an interpreter script, with the script passed
+ // as the third argument. We don't care about that script.
+ return PrintExecFn();
+ }
+
+ if (func == "PrintExecName") {
+ // N.B. This may be called as an interpreter script like PrintExecFn.
+ return PrintExecName();
+ }
+
+ std::cerr << "Invalid function: " << func << std::endl;
+ return 1;
+}
diff --git a/test/syscalls/linux/exit.cc b/test/syscalls/linux/exit.cc
new file mode 100644
index 000000000..d52ea786b
--- /dev/null
+++ b/test/syscalls/linux/exit.cc
@@ -0,0 +1,78 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/time.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+#include "test/util/time_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void TestExit(int code) {
+ pid_t pid = fork();
+ if (pid == 0) {
+ _exit(code);
+ }
+
+ ASSERT_THAT(pid, SyscallSucceeds());
+
+ int status;
+ EXPECT_THAT(RetryEINTR(waitpid)(pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == code) << status;
+}
+
+TEST(ExitTest, Success) { TestExit(0); }
+
+TEST(ExitTest, Failure) { TestExit(1); }
+
+// This test ensures that a process's file descriptors are closed when it calls
+// exit(). In order to test this, the parent tries to read from a pipe whose
+// write end is held by the child. While the read is blocking, the child exits,
+// which should cause the parent to read 0 bytes due to EOF.
+TEST(ExitTest, CloseFds) {
+ int pipe_fds[2];
+ ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds());
+
+ FileDescriptor read_fd(pipe_fds[0]);
+ FileDescriptor write_fd(pipe_fds[1]);
+
+ pid_t pid = fork();
+ if (pid == 0) {
+ read_fd.reset();
+
+ SleepSafe(absl::Seconds(10));
+
+ _exit(0);
+ }
+
+ EXPECT_THAT(pid, SyscallSucceeds());
+
+ write_fd.reset();
+
+ char buf[10];
+ EXPECT_THAT(ReadFd(read_fd.get(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(0));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/exit_script.sh b/test/syscalls/linux/exit_script.sh
new file mode 100755
index 000000000..527518e06
--- /dev/null
+++ b/test/syscalls/linux/exit_script.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# -ne 1 ]; then
+ echo "Usage: $0 exit_code"
+ exit 255
+fi
+
+exit $1
diff --git a/test/syscalls/linux/fadvise64.cc b/test/syscalls/linux/fadvise64.cc
new file mode 100644
index 000000000..2af7aa6d9
--- /dev/null
+++ b/test/syscalls/linux/fadvise64.cc
@@ -0,0 +1,72 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+TEST(FAdvise64Test, Basic) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+ // fadvise64 is noop in gVisor, so just test that it succeeds.
+ ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, POSIX_FADV_NORMAL),
+ SyscallSucceeds());
+ ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, POSIX_FADV_RANDOM),
+ SyscallSucceeds());
+ ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, POSIX_FADV_SEQUENTIAL),
+ SyscallSucceeds());
+ ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, POSIX_FADV_WILLNEED),
+ SyscallSucceeds());
+ ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, POSIX_FADV_DONTNEED),
+ SyscallSucceeds());
+ ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, POSIX_FADV_NOREUSE),
+ SyscallSucceeds());
+}
+
+TEST(FAdvise64Test, InvalidArgs) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+ // Note: offset is allowed to be negative.
+ ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, static_cast<off_t>(-1),
+ POSIX_FADV_NORMAL),
+ SyscallFailsWithErrno(EINVAL));
+ ASSERT_THAT(syscall(__NR_fadvise64, fd.get(), 0, 10, 12345),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(FAdvise64Test, NoPipes) {
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor read(fds[0]);
+ const FileDescriptor write(fds[1]);
+
+ ASSERT_THAT(syscall(__NR_fadvise64, read.get(), 0, 10, POSIX_FADV_NORMAL),
+ SyscallFailsWithErrno(ESPIPE));
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc
new file mode 100644
index 000000000..cabc2b751
--- /dev/null
+++ b/test/syscalls/linux/fallocate.cc
@@ -0,0 +1,186 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/eventfd.h>
+#include <sys/resource.h>
+#include <sys/signalfd.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/timerfd.h>
+#include <syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <ctime>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/eventfd_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+int fallocate(int fd, int mode, off_t offset, off_t len) {
+ return RetryEINTR(syscall)(__NR_fallocate, fd, mode, offset, len);
+}
+
+class AllocateTest : public FileTest {
+ void SetUp() override { FileTest::SetUp(); }
+};
+
+TEST_F(AllocateTest, Fallocate) {
+ // Check that it starts at size zero.
+ struct stat buf;
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 0);
+
+ // Grow to ten bytes.
+ ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 10), SyscallSucceeds());
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 10);
+
+ // Allocate to a smaller size should be noop.
+ ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 5), SyscallSucceeds());
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 10);
+
+ // Grow again.
+ ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 0, 20), SyscallSucceeds());
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 20);
+
+ // Grow with offset.
+ ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 10, 20), SyscallSucceeds());
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 30);
+
+ // Grow with offset beyond EOF.
+ ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds());
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 40);
+
+ // Given length 0 should fail with EINVAL.
+ ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 50, 0),
+ SyscallFailsWithErrno(EINVAL));
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 40);
+}
+
+TEST_F(AllocateTest, FallocateInvalid) {
+ // Invalid FD
+ EXPECT_THAT(fallocate(-1, 0, 0, 10), SyscallFailsWithErrno(EBADF));
+
+ // Negative offset and size.
+ EXPECT_THAT(fallocate(test_file_fd_.get(), 0, -1, 10),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, -1),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(fallocate(test_file_fd_.get(), 0, -1, -1),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(AllocateTest, FallocateReadonly) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+ EXPECT_THAT(fallocate(fd.get(), 0, 0, 10), SyscallFailsWithErrno(EBADF));
+}
+
+TEST_F(AllocateTest, FallocatePipe) {
+ int pipes[2];
+ EXPECT_THAT(pipe(pipes), SyscallSucceeds());
+ auto cleanup = Cleanup([&pipes] {
+ EXPECT_THAT(close(pipes[0]), SyscallSucceeds());
+ EXPECT_THAT(close(pipes[1]), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(fallocate(pipes[1], 0, 0, 10), SyscallFailsWithErrno(ESPIPE));
+}
+
+TEST_F(AllocateTest, FallocateChar) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/null", O_RDWR));
+ EXPECT_THAT(fallocate(fd.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV));
+}
+
+TEST_F(AllocateTest, FallocateRlimit) {
+ // Get the current rlimit and restore after test run.
+ struct rlimit initial_lim;
+ ASSERT_THAT(getrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds());
+ auto cleanup = Cleanup([&initial_lim] {
+ EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds());
+ });
+
+ // Try growing past the file size limit.
+ sigset_t new_mask;
+ sigemptyset(&new_mask);
+ sigaddset(&new_mask, SIGXFSZ);
+ sigprocmask(SIG_BLOCK, &new_mask, nullptr);
+
+ struct rlimit setlim = {};
+ setlim.rlim_cur = 1024;
+ setlim.rlim_max = RLIM_INFINITY;
+ ASSERT_THAT(setrlimit(RLIMIT_FSIZE, &setlim), SyscallSucceeds());
+
+ EXPECT_THAT(fallocate(test_file_fd_.get(), 0, 0, 1025),
+ SyscallFailsWithErrno(EFBIG));
+
+ struct timespec timelimit = {};
+ timelimit.tv_sec = 10;
+ EXPECT_EQ(sigtimedwait(&new_mask, nullptr, &timelimit), SIGXFSZ);
+ ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &new_mask, nullptr), SyscallSucceeds());
+}
+
+TEST_F(AllocateTest, FallocateOtherFDs) {
+ int fd;
+ ASSERT_THAT(fd = timerfd_create(CLOCK_MONOTONIC, 0), SyscallSucceeds());
+ auto timer_fd = FileDescriptor(fd);
+ EXPECT_THAT(fallocate(timer_fd.get(), 0, 0, 10),
+ SyscallFailsWithErrno(ENODEV));
+
+ sigset_t mask;
+ sigemptyset(&mask);
+ ASSERT_THAT(fd = signalfd(-1, &mask, 0), SyscallSucceeds());
+ auto sfd = FileDescriptor(fd);
+ EXPECT_THAT(fallocate(sfd.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV));
+
+ auto efd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));
+ EXPECT_THAT(fallocate(efd.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV));
+
+ auto sockfd = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+ EXPECT_THAT(fallocate(sockfd.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV));
+
+ int socks[2];
+ ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, PF_UNIX, socks),
+ SyscallSucceeds());
+ auto sock0 = FileDescriptor(socks[0]);
+ auto sock1 = FileDescriptor(socks[1]);
+ EXPECT_THAT(fallocate(sock0.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV));
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/fault.cc b/test/syscalls/linux/fault.cc
new file mode 100644
index 000000000..a85750382
--- /dev/null
+++ b/test/syscalls/linux/fault.cc
@@ -0,0 +1,74 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define _GNU_SOURCE 1
+#include <signal.h>
+#include <ucontext.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+__attribute__((noinline)) void Fault(void) {
+ volatile int* foo = nullptr;
+ *foo = 0;
+}
+
+int GetPcFromUcontext(ucontext_t* uc, uintptr_t* pc) {
+#if defined(__x86_64__)
+ *pc = uc->uc_mcontext.gregs[REG_RIP];
+ return 1;
+#elif defined(__i386__)
+ *pc = uc->uc_mcontext.gregs[REG_EIP];
+ return 1;
+#elif defined(__aarch64__)
+ *pc = uc->uc_mcontext.pc;
+ return 1;
+#else
+ return 0;
+#endif
+}
+
+void sigact_handler(int sig, siginfo_t* siginfo, void* context) {
+ uintptr_t pc;
+ if (GetPcFromUcontext(reinterpret_cast<ucontext_t*>(context), &pc)) {
+ /* Expect Fault() to be at most 64 bytes in size. */
+ uintptr_t fault_addr = reinterpret_cast<uintptr_t>(&Fault);
+ EXPECT_GE(pc, fault_addr);
+ EXPECT_LT(pc, fault_addr + 64);
+ exit(0);
+ }
+}
+
+TEST(FaultTest, InRange) {
+ // Reset the signal handler to do nothing so that it doesn't freak out
+ // the test runner when we fire an alarm.
+ struct sigaction sa = {};
+ sa.sa_sigaction = sigact_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ ASSERT_THAT(sigaction(SIGSEGV, &sa, nullptr), SyscallSucceeds());
+
+ Fault();
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/fchdir.cc b/test/syscalls/linux/fchdir.cc
new file mode 100644
index 000000000..08bcae1e8
--- /dev/null
+++ b/test/syscalls/linux/fchdir.cc
@@ -0,0 +1,77 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/util/capability_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(FchdirTest, Success) {
+ auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ int fd;
+ ASSERT_THAT(fd = open(temp_dir.path().c_str(), O_DIRECTORY | O_RDONLY),
+ SyscallSucceeds());
+
+ EXPECT_THAT(fchdir(fd), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ // Change CWD to a permanent location as temp dirs will be cleaned up.
+ EXPECT_THAT(chdir("/"), SyscallSucceeds());
+}
+
+TEST(FchdirTest, InvalidFD) {
+ EXPECT_THAT(fchdir(-1), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(FchdirTest, PermissionDenied) {
+ // Drop capabilities that allow us to override directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0666 /* mode */));
+
+ int fd;
+ ASSERT_THAT(fd = open(temp_dir.path().c_str(), O_DIRECTORY | O_RDONLY),
+ SyscallSucceeds());
+
+ EXPECT_THAT(fchdir(fd), SyscallFailsWithErrno(EACCES));
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST(FchdirTest, NotDir) {
+ auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ int fd;
+ ASSERT_THAT(fd = open(temp_file.path().c_str(), O_CREAT | O_RDONLY, 0777),
+ SyscallSucceeds());
+
+ EXPECT_THAT(fchdir(fd), SyscallFailsWithErrno(ENOTDIR));
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc
new file mode 100644
index 000000000..5467fa2c8
--- /dev/null
+++ b/test/syscalls/linux/fcntl.cc
@@ -0,0 +1,1353 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "absl/base/port.h"
+#include "absl/flags/flag.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/eventfd_util.h"
+#include "test/util/fs_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/save_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/timer_util.h"
+
+ABSL_FLAG(std::string, child_setlock_on, "",
+ "Contains the path to try to set a file lock on.");
+ABSL_FLAG(bool, child_setlock_write, false,
+ "Whether to set a writable lock (otherwise readable)");
+ABSL_FLAG(bool, blocking, false,
+ "Whether to set a blocking lock (otherwise non-blocking).");
+ABSL_FLAG(bool, retry_eintr, false,
+ "Whether to retry in the subprocess on EINTR.");
+ABSL_FLAG(uint64_t, child_setlock_start, 0, "The value of struct flock start");
+ABSL_FLAG(uint64_t, child_setlock_len, 0, "The value of struct flock len");
+ABSL_FLAG(int32_t, socket_fd, -1,
+ "A socket to use for communicating more state back "
+ "to the parent.");
+
+namespace gvisor {
+namespace testing {
+
+class FcntlLockTest : public ::testing::Test {
+ public:
+ void SetUp() override {
+ // Let's make a socket pair.
+ ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, fds_), SyscallSucceeds());
+ }
+
+ void TearDown() override {
+ EXPECT_THAT(close(fds_[0]), SyscallSucceeds());
+ EXPECT_THAT(close(fds_[1]), SyscallSucceeds());
+ }
+
+ int64_t GetSubprocessFcntlTimeInUsec() {
+ int64_t ret = 0;
+ EXPECT_THAT(ReadFd(fds_[0], reinterpret_cast<void*>(&ret), sizeof(ret)),
+ SyscallSucceedsWithValue(sizeof(ret)));
+ return ret;
+ }
+
+ // The first fd will remain with the process creating the subprocess
+ // and the second will go to the subprocess.
+ int fds_[2] = {};
+};
+
+namespace {
+
+PosixErrorOr<Cleanup> SubprocessLock(std::string const& path, bool for_write,
+ bool blocking, bool retry_eintr, int fd,
+ off_t start, off_t length, pid_t* child) {
+ std::vector<std::string> args = {
+ "/proc/self/exe", "--child_setlock_on", path,
+ "--child_setlock_start", absl::StrCat(start), "--child_setlock_len",
+ absl::StrCat(length), "--socket_fd", absl::StrCat(fd)};
+
+ if (for_write) {
+ args.push_back("--child_setlock_write");
+ }
+
+ if (blocking) {
+ args.push_back("--blocking");
+ }
+
+ if (retry_eintr) {
+ args.push_back("--retry_eintr");
+ }
+
+ int execve_errno = 0;
+ ASSIGN_OR_RETURN_ERRNO(
+ auto cleanup,
+ ForkAndExec("/proc/self/exe", ExecveArray(args.begin(), args.end()), {},
+ nullptr, child, &execve_errno));
+
+ if (execve_errno != 0) {
+ return PosixError(execve_errno, "execve");
+ }
+
+ return std::move(cleanup);
+}
+
+TEST(FcntlTest, SetCloExecBadFD) {
+ // Open an eventfd file descriptor with FD_CLOEXEC descriptor flag not set.
+ FileDescriptor f = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, 0));
+ auto fd = f.get();
+ f.reset();
+ ASSERT_THAT(fcntl(fd, F_GETFD), SyscallFailsWithErrno(EBADF));
+ ASSERT_THAT(fcntl(fd, F_SETFD, FD_CLOEXEC), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(FcntlTest, SetCloExec) {
+ // Open an eventfd file descriptor with FD_CLOEXEC descriptor flag not set.
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, 0));
+ ASSERT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(0));
+
+ // Set the FD_CLOEXEC flag.
+ ASSERT_THAT(fcntl(fd.get(), F_SETFD, FD_CLOEXEC), SyscallSucceeds());
+ ASSERT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
+}
+
+TEST(FcntlTest, ClearCloExec) {
+ // Open an eventfd file descriptor with FD_CLOEXEC descriptor flag set.
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_CLOEXEC));
+ ASSERT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
+
+ // Clear the FD_CLOEXEC flag.
+ ASSERT_THAT(fcntl(fd.get(), F_SETFD, 0), SyscallSucceeds());
+ ASSERT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(0));
+}
+
+TEST(FcntlTest, IndependentDescriptorFlags) {
+ // Open an eventfd file descriptor with FD_CLOEXEC descriptor flag not set.
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, 0));
+ ASSERT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(0));
+
+ // Duplicate the descriptor. Ensure that it also doesn't have FD_CLOEXEC.
+ FileDescriptor newfd = ASSERT_NO_ERRNO_AND_VALUE(fd.Dup());
+ ASSERT_THAT(fcntl(newfd.get(), F_GETFD), SyscallSucceedsWithValue(0));
+
+ // Set FD_CLOEXEC on the first FD.
+ ASSERT_THAT(fcntl(fd.get(), F_SETFD, FD_CLOEXEC), SyscallSucceeds());
+ ASSERT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
+
+ // Ensure that the second FD is unaffected by the change on the first.
+ ASSERT_THAT(fcntl(newfd.get(), F_GETFD), SyscallSucceedsWithValue(0));
+}
+
+// All file description flags passed to open appear in F_GETFL.
+TEST(FcntlTest, GetAllFlags) {
+ TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ int flags = O_RDWR | O_DIRECT | O_SYNC | O_NONBLOCK | O_APPEND;
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), flags));
+
+ // Linux forces O_LARGEFILE on all 64-bit kernels and gVisor's is 64-bit.
+ int expected = flags | kOLargeFile;
+
+ int rflags;
+ EXPECT_THAT(rflags = fcntl(fd.get(), F_GETFL), SyscallSucceeds());
+ EXPECT_EQ(rflags, expected);
+}
+
+TEST(FcntlTest, SetFlags) {
+ TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), 0));
+
+ int const flags = O_RDWR | O_DIRECT | O_SYNC | O_NONBLOCK | O_APPEND;
+ EXPECT_THAT(fcntl(fd.get(), F_SETFL, flags), SyscallSucceeds());
+
+ // Can't set O_RDWR or O_SYNC.
+ // Linux forces O_LARGEFILE on all 64-bit kernels and gVisor's is 64-bit.
+ int expected = O_DIRECT | O_NONBLOCK | O_APPEND | kOLargeFile;
+
+ int rflags;
+ EXPECT_THAT(rflags = fcntl(fd.get(), F_GETFL), SyscallSucceeds());
+ EXPECT_EQ(rflags, expected);
+}
+
+void TestLock(int fd, short lock_type = F_RDLCK) { // NOLINT, type in flock
+ struct flock fl;
+ fl.l_type = lock_type;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ // len 0 locks all bytes despite how large the file grows.
+ fl.l_len = 0;
+ EXPECT_THAT(fcntl(fd, F_SETLK, &fl), SyscallSucceeds());
+}
+
+void TestLockBadFD(int fd,
+ short lock_type = F_RDLCK) { // NOLINT, type in flock
+ struct flock fl;
+ fl.l_type = lock_type;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ // len 0 locks all bytes despite how large the file grows.
+ fl.l_len = 0;
+ EXPECT_THAT(fcntl(fd, F_SETLK, &fl), SyscallFailsWithErrno(EBADF));
+}
+
+TEST_F(FcntlLockTest, SetLockBadFd) { TestLockBadFD(-1); }
+
+TEST_F(FcntlLockTest, SetLockDir) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0000));
+ TestLock(fd.get());
+}
+
+TEST_F(FcntlLockTest, SetLockSymlink) {
+ // TODO(gvisor.dev/issue/2782): Replace with IsRunningWithVFS1() when O_PATH
+ // is supported.
+ SKIP_IF(IsRunningOnGvisor());
+
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ auto symlink = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), file.path()));
+
+ auto fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(symlink.path(), O_RDONLY | O_PATH, 0000));
+ TestLockBadFD(fd.get());
+}
+
+TEST_F(FcntlLockTest, SetLockProc) {
+ auto fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/status", O_RDONLY, 0000));
+ TestLock(fd.get());
+}
+
+TEST_F(FcntlLockTest, SetLockPipe) {
+ SKIP_IF(IsRunningWithVFS1());
+
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+
+ TestLock(fds[0]);
+ TestLockBadFD(fds[0], F_WRLCK);
+
+ TestLock(fds[1], F_WRLCK);
+ TestLockBadFD(fds[1]);
+
+ EXPECT_THAT(close(fds[0]), SyscallSucceeds());
+ EXPECT_THAT(close(fds[1]), SyscallSucceeds());
+}
+
+TEST_F(FcntlLockTest, SetLockSocket) {
+ SKIP_IF(IsRunningWithVFS1());
+
+ int sock = socket(AF_UNIX, SOCK_STREAM, 0);
+ ASSERT_THAT(sock, SyscallSucceeds());
+
+ struct sockaddr_un addr =
+ ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(true /* abstract */, AF_UNIX));
+ ASSERT_THAT(
+ bind(sock, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+ SyscallSucceeds());
+
+ TestLock(sock);
+ EXPECT_THAT(close(sock), SyscallSucceeds());
+}
+
+TEST_F(FcntlLockTest, SetLockBadOpenFlagsWrite) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY, 0666));
+
+ struct flock fl0;
+ fl0.l_type = F_WRLCK;
+ fl0.l_whence = SEEK_SET;
+ fl0.l_start = 0;
+ fl0.l_len = 0; // Lock all file
+
+ // Expect that setting a write lock using a read only file descriptor
+ // won't work.
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl0), SyscallFailsWithErrno(EBADF));
+}
+
+TEST_F(FcntlLockTest, SetLockBadOpenFlagsRead) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY, 0666));
+
+ struct flock fl1;
+ fl1.l_type = F_RDLCK;
+ fl1.l_whence = SEEK_SET;
+ fl1.l_start = 0;
+ // Same as SetLockBadFd.
+ fl1.l_len = 0;
+
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl1), SyscallFailsWithErrno(EBADF));
+}
+
+TEST_F(FcntlLockTest, SetLockUnlockOnNothing) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ struct flock fl;
+ fl.l_type = F_UNLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ // Same as SetLockBadFd.
+ fl.l_len = 0;
+
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds());
+}
+
+TEST_F(FcntlLockTest, SetWriteLockSingleProc) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd0 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ struct flock fl;
+ fl.l_type = F_WRLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ // Same as SetLockBadFd.
+ fl.l_len = 0;
+
+ EXPECT_THAT(fcntl(fd0.get(), F_SETLK, &fl), SyscallSucceeds());
+ // Expect to be able to take the same lock on the same fd no problem.
+ EXPECT_THAT(fcntl(fd0.get(), F_SETLK, &fl), SyscallSucceeds());
+
+ FileDescriptor fd1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ // Expect to be able to take the same lock from a different fd but for
+ // the same process.
+ EXPECT_THAT(fcntl(fd1.get(), F_SETLK, &fl), SyscallSucceeds());
+}
+
+TEST_F(FcntlLockTest, SetReadLockMultiProc) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ struct flock fl;
+ fl.l_type = F_RDLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ // Same as SetLockBadFd.
+ fl.l_len = 0;
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds());
+
+ // spawn a child process to take a read lock on the same file.
+ pid_t child_pid = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ SubprocessLock(file.path(), false /* write lock */,
+ false /* nonblocking */, false /* no eintr retry */,
+ -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid));
+
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+TEST_F(FcntlLockTest, SetReadThenWriteLockMultiProc) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ struct flock fl;
+ fl.l_type = F_RDLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ // Same as SetLockBadFd.
+ fl.l_len = 0;
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds());
+
+ // Assert that another process trying to lock on the same file will fail
+ // with EAGAIN. It's important that we keep the fd above open so that
+ // that the other process will contend with the lock.
+ pid_t child_pid = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ SubprocessLock(file.path(), true /* write lock */,
+ false /* nonblocking */, false /* no eintr retry */,
+ -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid));
+
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == EAGAIN)
+ << "Exited with code: " << status;
+
+ // Close the fd: we want to test that another process can acquire the
+ // lock after this point.
+ fd.reset();
+ // Assert that another process can now acquire the lock.
+
+ child_pid = 0;
+ auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE(
+ SubprocessLock(file.path(), true /* write lock */,
+ false /* nonblocking */, false /* no eintr retry */,
+ -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid));
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+TEST_F(FcntlLockTest, SetWriteThenReadLockMultiProc) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+ // Same as SetReadThenWriteLockMultiProc.
+
+ struct flock fl;
+ fl.l_type = F_WRLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ // Same as SetLockBadFd.
+ fl.l_len = 0;
+
+ // Same as SetReadThenWriteLockMultiProc.
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds());
+
+ // Same as SetReadThenWriteLockMultiProc.
+ pid_t child_pid = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ SubprocessLock(file.path(), false /* write lock */,
+ false /* nonblocking */, false /* no eintr retry */,
+ -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid));
+
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == EAGAIN)
+ << "Exited with code: " << status;
+
+ // Same as SetReadThenWriteLockMultiProc.
+ fd.reset(); // Close the fd.
+
+ // Same as SetReadThenWriteLockMultiProc.
+ child_pid = 0;
+ auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE(
+ SubprocessLock(file.path(), false /* write lock */,
+ false /* nonblocking */, false /* no eintr retry */,
+ -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid));
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+TEST_F(FcntlLockTest, SetWriteLockMultiProc) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+ // Same as SetReadThenWriteLockMultiProc.
+
+ struct flock fl;
+ fl.l_type = F_WRLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ // Same as SetLockBadFd.
+ fl.l_len = 0;
+
+ // Same as SetReadWriteLockMultiProc.
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds());
+
+ // Same as SetReadWriteLockMultiProc.
+ pid_t child_pid = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ SubprocessLock(file.path(), true /* write lock */,
+ false /* nonblocking */, false /* no eintr retry */,
+ -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid));
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == EAGAIN)
+ << "Exited with code: " << status;
+
+ fd.reset(); // Close the FD.
+ // Same as SetReadWriteLockMultiProc.
+ child_pid = 0;
+ auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE(
+ SubprocessLock(file.path(), true /* write lock */,
+ false /* nonblocking */, false /* no eintr retry */,
+ -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid));
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+TEST_F(FcntlLockTest, SetLockIsRegional) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ struct flock fl;
+ fl.l_type = F_WRLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ fl.l_len = 4096;
+
+ // Same as SetReadWriteLockMultiProc.
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds());
+
+ // Same as SetReadWriteLockMultiProc.
+ pid_t child_pid = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ SubprocessLock(file.path(), true /* write lock */,
+ false /* nonblocking */, false /* no eintr retry */,
+ -1 /* no socket fd */, fl.l_len, 0, &child_pid));
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+TEST_F(FcntlLockTest, SetLockUpgradeDowngrade) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ struct flock fl;
+ fl.l_type = F_RDLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ // Same as SetLockBadFd.
+ fl.l_len = 0;
+
+ // Same as SetReadWriteLockMultiProc.
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds());
+
+ // Upgrade to a write lock. This will prevent anyone else from taking
+ // the lock.
+ fl.l_type = F_WRLCK;
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds());
+
+ // Same as SetReadWriteLockMultiProc.,
+ pid_t child_pid = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ SubprocessLock(file.path(), false /* write lock */,
+ false /* nonblocking */, false /* no eintr retry */,
+ -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid));
+
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == EAGAIN)
+ << "Exited with code: " << status;
+
+ // Downgrade back to a read lock.
+ fl.l_type = F_RDLCK;
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds());
+
+ // Do the same stint as before, but this time it should succeed.
+ child_pid = 0;
+ auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE(
+ SubprocessLock(file.path(), false /* write lock */,
+ false /* nonblocking */, false /* no eintr retry */,
+ -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid));
+
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+TEST_F(FcntlLockTest, SetLockDroppedOnClose) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ // While somewhat surprising, obtaining another fd to the same file and
+ // then closing it in this process drops *all* locks.
+ FileDescriptor other_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+ // Same as SetReadThenWriteLockMultiProc.
+
+ struct flock fl;
+ fl.l_type = F_WRLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ // Same as SetLockBadFd.
+ fl.l_len = 0;
+
+ // Same as SetReadWriteLockMultiProc.
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds());
+
+ other_fd.reset(); // Close.
+
+ // Expect to be able to get the lock, given that the close above dropped it.
+ pid_t child_pid = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ SubprocessLock(file.path(), true /* write lock */,
+ false /* nonblocking */, false /* no eintr retry */,
+ -1 /* no socket fd */, fl.l_start, fl.l_len, &child_pid));
+
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+TEST_F(FcntlLockTest, SetLockUnlock) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ // Setup two regional locks with different permissions.
+ struct flock fl0;
+ fl0.l_type = F_WRLCK;
+ fl0.l_whence = SEEK_SET;
+ fl0.l_start = 0;
+ fl0.l_len = 4096;
+
+ struct flock fl1;
+ fl1.l_type = F_RDLCK;
+ fl1.l_whence = SEEK_SET;
+ fl1.l_start = 4096;
+ // Same as SetLockBadFd.
+ fl1.l_len = 0;
+
+ // Set both region locks.
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl0), SyscallSucceeds());
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl1), SyscallSucceeds());
+
+ // Another process should fail to take a read lock on the entire file
+ // due to the regional write lock.
+ pid_t child_pid = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(SubprocessLock(
+ file.path(), false /* write lock */, false /* nonblocking */,
+ false /* no eintr retry */, -1 /* no socket fd */, 0, 0, &child_pid));
+
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == EAGAIN)
+ << "Exited with code: " << status;
+
+ // Then only unlock the writable one. This should ensure that other
+ // processes can take any read lock that it wants.
+ fl0.l_type = F_UNLCK;
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl0), SyscallSucceeds());
+
+ // Another process should now succeed to get a read lock on the entire file.
+ child_pid = 0;
+ auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE(SubprocessLock(
+ file.path(), false /* write lock */, false /* nonblocking */,
+ false /* no eintr retry */, -1 /* no socket fd */, 0, 0, &child_pid));
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+TEST_F(FcntlLockTest, SetLockAcrossRename) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ // Setup two regional locks with different permissions.
+ struct flock fl;
+ fl.l_type = F_WRLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ // Same as SetLockBadFd.
+ fl.l_len = 0;
+
+ // Set the region lock.
+ EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds());
+
+ // Rename the file to someplace nearby.
+ std::string const newpath = NewTempAbsPath();
+ EXPECT_THAT(rename(file.path().c_str(), newpath.c_str()), SyscallSucceeds());
+
+ // Another process should fail to take a read lock on the renamed file
+ // since we still have an open handle to the inode.
+ pid_t child_pid = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ SubprocessLock(newpath, false /* write lock */, false /* nonblocking */,
+ false /* no eintr retry */, -1 /* no socket fd */,
+ fl.l_start, fl.l_len, &child_pid));
+
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == EAGAIN)
+ << "Exited with code: " << status;
+}
+
+// NOTE: The blocking tests below aren't perfect. It's hard to assert exactly
+// what the kernel did while handling a syscall. These tests are timing based
+// because there really isn't any other reasonable way to assert that correct
+// blocking behavior happened.
+
+// This test will verify that blocking works as expected when another process
+// holds a write lock when obtaining a write lock. This test will hold the lock
+// for some amount of time and then wait for the second process to send over the
+// socket_fd the amount of time it was blocked for before the lock succeeded.
+TEST_F(FcntlLockTest, SetWriteLockThenBlockingWriteLock) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ struct flock fl;
+ fl.l_type = F_WRLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ fl.l_len = 0;
+
+ // Take the write lock.
+ ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
+
+ // Attempt to take the read lock in a sub process. This will immediately block
+ // so we will release our lock after some amount of time and then assert the
+ // amount of time the other process was blocked for.
+ pid_t child_pid = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(SubprocessLock(
+ file.path(), true /* write lock */, true /* Blocking Lock */,
+ true /* Retry on EINTR */, fds_[1] /* Socket fd for timing information */,
+ fl.l_start, fl.l_len, &child_pid));
+
+ // We will wait kHoldLockForSec before we release our lock allowing the
+ // subprocess to obtain it.
+ constexpr absl::Duration kHoldLockFor = absl::Seconds(5);
+ const int64_t kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
+
+ absl::SleepFor(kHoldLockFor);
+
+ // Unlock our write lock.
+ fl.l_type = F_UNLCK;
+ ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
+
+ // Read the blocked time from the subprocess socket.
+ int64_t subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
+
+ // We must have been waiting at least kMinBlockTime.
+ EXPECT_GT(subprocess_blocked_time_usec, kMinBlockTimeUsec);
+
+ // The FCNTL write lock must always succeed as it will simply block until it
+ // can obtain the lock.
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+// This test will verify that blocking works as expected when another process
+// holds a read lock when obtaining a write lock. This test will hold the lock
+// for some amount of time and then wait for the second process to send over the
+// socket_fd the amount of time it was blocked for before the lock succeeded.
+TEST_F(FcntlLockTest, SetReadLockThenBlockingWriteLock) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ struct flock fl;
+ fl.l_type = F_RDLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ fl.l_len = 0;
+
+ // Take the write lock.
+ ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
+
+ // Attempt to take the read lock in a sub process. This will immediately block
+ // so we will release our lock after some amount of time and then assert the
+ // amount of time the other process was blocked for.
+ pid_t child_pid = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(SubprocessLock(
+ file.path(), true /* write lock */, true /* Blocking Lock */,
+ true /* Retry on EINTR */, fds_[1] /* Socket fd for timing information */,
+ fl.l_start, fl.l_len, &child_pid));
+
+ // We will wait kHoldLockForSec before we release our lock allowing the
+ // subprocess to obtain it.
+ constexpr absl::Duration kHoldLockFor = absl::Seconds(5);
+
+ const int64_t kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
+
+ absl::SleepFor(kHoldLockFor);
+
+ // Unlock our READ lock.
+ fl.l_type = F_UNLCK;
+ ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
+
+ // Read the blocked time from the subprocess socket.
+ int64_t subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
+
+ // We must have been waiting at least kMinBlockTime.
+ EXPECT_GT(subprocess_blocked_time_usec, kMinBlockTimeUsec);
+
+ // The FCNTL write lock must always succeed as it will simply block until it
+ // can obtain the lock.
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+// This test will veirfy that blocking works as expected when another process
+// holds a write lock when obtaining a read lock. This test will hold the lock
+// for some amount of time and then wait for the second process to send over the
+// socket_fd the amount of time it was blocked for before the lock succeeded.
+TEST_F(FcntlLockTest, SetWriteLockThenBlockingReadLock) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ struct flock fl;
+ fl.l_type = F_WRLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ fl.l_len = 0;
+
+ // Take the write lock.
+ ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
+
+ // Attempt to take the read lock in a sub process. This will immediately block
+ // so we will release our lock after some amount of time and then assert the
+ // amount of time the other process was blocked for.
+ pid_t child_pid = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(SubprocessLock(
+ file.path(), false /* read lock */, true /* Blocking Lock */,
+ true /* Retry on EINTR */, fds_[1] /* Socket fd for timing information */,
+ fl.l_start, fl.l_len, &child_pid));
+
+ // We will wait kHoldLockForSec before we release our lock allowing the
+ // subprocess to obtain it.
+ constexpr absl::Duration kHoldLockFor = absl::Seconds(5);
+
+ const int64_t kMinBlockTimeUsec = absl::ToInt64Microseconds(absl::Seconds(1));
+
+ absl::SleepFor(kHoldLockFor);
+
+ // Unlock our write lock.
+ fl.l_type = F_UNLCK;
+ ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
+
+ // Read the blocked time from the subprocess socket.
+ int64_t subprocess_blocked_time_usec = GetSubprocessFcntlTimeInUsec();
+
+ // We must have been waiting at least kMinBlockTime.
+ EXPECT_GT(subprocess_blocked_time_usec, kMinBlockTimeUsec);
+
+ // The FCNTL read lock must always succeed as it will simply block until it
+ // can obtain the lock.
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+// This test will verify that when one process only holds a read lock that
+// another will not block while obtaining a read lock when F_SETLKW is used.
+TEST_F(FcntlLockTest, SetReadLockThenBlockingReadLock) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+
+ struct flock fl;
+ fl.l_type = F_RDLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = 0;
+ fl.l_len = 0;
+
+ // Take the READ lock.
+ ASSERT_THAT(fcntl(fd.get(), F_SETLKW, &fl), SyscallSucceeds());
+
+ // Attempt to take the read lock in a sub process. Since multiple processes
+ // can hold a read lock this should immediately return without blocking
+ // even though we used F_SETLKW in the subprocess.
+ pid_t child_pid = 0;
+ auto sp = ASSERT_NO_ERRNO_AND_VALUE(SubprocessLock(
+ file.path(), false /* read lock */, true /* Blocking Lock */,
+ true /* Retry on EINTR */, -1 /* No fd, should not block */, fl.l_start,
+ fl.l_len, &child_pid));
+
+ // We never release the lock and the subprocess should still obtain it without
+ // blocking for any period of time.
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+TEST(FcntlTest, GetO_ASYNC) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ int flag_fl = -1;
+ ASSERT_THAT(flag_fl = fcntl(s.get(), F_GETFL), SyscallSucceeds());
+ EXPECT_EQ(flag_fl & O_ASYNC, 0);
+
+ int flag_fd = -1;
+ ASSERT_THAT(flag_fd = fcntl(s.get(), F_GETFD), SyscallSucceeds());
+ EXPECT_EQ(flag_fd & O_ASYNC, 0);
+}
+
+TEST(FcntlTest, SetFlO_ASYNC) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ int before_fl = -1;
+ ASSERT_THAT(before_fl = fcntl(s.get(), F_GETFL), SyscallSucceeds());
+
+ int before_fd = -1;
+ ASSERT_THAT(before_fd = fcntl(s.get(), F_GETFD), SyscallSucceeds());
+
+ ASSERT_THAT(fcntl(s.get(), F_SETFL, before_fl | O_ASYNC), SyscallSucceeds());
+
+ int after_fl = -1;
+ ASSERT_THAT(after_fl = fcntl(s.get(), F_GETFL), SyscallSucceeds());
+ EXPECT_EQ(after_fl, before_fl | O_ASYNC);
+
+ int after_fd = -1;
+ ASSERT_THAT(after_fd = fcntl(s.get(), F_GETFD), SyscallSucceeds());
+ EXPECT_EQ(after_fd, before_fd);
+}
+
+TEST(FcntlTest, SetFdO_ASYNC) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ int before_fl = -1;
+ ASSERT_THAT(before_fl = fcntl(s.get(), F_GETFL), SyscallSucceeds());
+
+ int before_fd = -1;
+ ASSERT_THAT(before_fd = fcntl(s.get(), F_GETFD), SyscallSucceeds());
+
+ ASSERT_THAT(fcntl(s.get(), F_SETFD, before_fd | O_ASYNC), SyscallSucceeds());
+
+ int after_fl = -1;
+ ASSERT_THAT(after_fl = fcntl(s.get(), F_GETFL), SyscallSucceeds());
+ EXPECT_EQ(after_fl, before_fl);
+
+ int after_fd = -1;
+ ASSERT_THAT(after_fd = fcntl(s.get(), F_GETFD), SyscallSucceeds());
+ EXPECT_EQ(after_fd, before_fd);
+}
+
+TEST(FcntlTest, DupAfterO_ASYNC) {
+ FileDescriptor s1 = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ int before = -1;
+ ASSERT_THAT(before = fcntl(s1.get(), F_GETFL), SyscallSucceeds());
+
+ ASSERT_THAT(fcntl(s1.get(), F_SETFL, before | O_ASYNC), SyscallSucceeds());
+
+ FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(s1.Dup());
+
+ int after = -1;
+ ASSERT_THAT(after = fcntl(fd2.get(), F_GETFL), SyscallSucceeds());
+ EXPECT_EQ(after & O_ASYNC, O_ASYNC);
+}
+
+TEST(FcntlTest, GetOwnNone) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ // Use the raw syscall because the glibc wrapper may convert F_{GET,SET}OWN
+ // into F_{GET,SET}OWN_EX.
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN),
+ SyscallSucceedsWithValue(0));
+ MaybeSave();
+}
+
+TEST(FcntlTest, GetOwnExNone) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &owner),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST(FcntlTest, SetOwnInvalidPid) {
+ SKIP_IF(IsRunningWithVFS1());
+
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 12345678),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnInvalidPgrp) {
+ SKIP_IF(IsRunningWithVFS1());
+
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -12345678),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnPid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ pid_t pid;
+ EXPECT_THAT(pid = getpid(), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), SyscallSucceeds());
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN),
+ SyscallSucceedsWithValue(pid));
+ MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnPgrp) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ pid_t pgid;
+ EXPECT_THAT(pgid = getpgrp(), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -pgid), SyscallSucceeds());
+
+ // Verify with F_GETOWN_EX; using F_GETOWN on Linux may incorrectly treat the
+ // negative return value as an error, converting the return value to -1 and
+ // setting errno accordingly.
+ f_owner_ex got_owner = {};
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(got_owner.type, F_OWNER_PGRP);
+ EXPECT_EQ(got_owner.pid, pgid);
+ MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnUnset) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ // Set and unset pid.
+ pid_t pid;
+ EXPECT_THAT(pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid), SyscallSucceeds());
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 0), SyscallSucceeds());
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN),
+ SyscallSucceedsWithValue(0));
+
+ // Set and unset pgid.
+ pid_t pgid;
+ EXPECT_THAT(pgid = getpgrp(), SyscallSucceeds());
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, -pgid), SyscallSucceeds());
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, 0), SyscallSucceeds());
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN),
+ SyscallSucceedsWithValue(0));
+ MaybeSave();
+}
+
+// F_SETOWN flips the sign of negative values, an operation that is guarded
+// against overflow.
+TEST(FcntlTest, SetOwnOverflow) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, INT_MIN),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(FcntlTest, SetOwnExInvalidType) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = __pid_type(-1);
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(FcntlTest, SetOwnExInvalidTid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = F_OWNER_TID;
+ owner.pid = -1;
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExInvalidPid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = F_OWNER_PID;
+ owner.pid = -1;
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExInvalidPgrp) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = F_OWNER_PGRP;
+ owner.pid = -1;
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(FcntlTest, SetOwnExTid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = F_OWNER_TID;
+ EXPECT_THAT(owner.pid = syscall(__NR_gettid), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallSucceeds());
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN),
+ SyscallSucceedsWithValue(owner.pid));
+ MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnExPid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex owner = {};
+ owner.type = F_OWNER_PID;
+ EXPECT_THAT(owner.pid = getpid(), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallSucceeds());
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN),
+ SyscallSucceedsWithValue(owner.pid));
+ MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnExPgrp) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex set_owner = {};
+ set_owner.type = F_OWNER_PGRP;
+ EXPECT_THAT(set_owner.pid = getpgrp(), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+ SyscallSucceeds());
+
+ // Verify with F_GETOWN_EX; using F_GETOWN on Linux may incorrectly treat the
+ // negative return value as an error, converting the return value to -1 and
+ // setting errno accordingly.
+ f_owner_ex got_owner = {};
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(got_owner.type, set_owner.type);
+ EXPECT_EQ(got_owner.pid, set_owner.pid);
+ MaybeSave();
+}
+
+TEST(FcntlTest, SetOwnExUnset) {
+ SKIP_IF(IsRunningWithVFS1());
+
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ // Set and unset pid.
+ f_owner_ex owner = {};
+ owner.type = F_OWNER_PID;
+ EXPECT_THAT(owner.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallSucceeds());
+ owner.pid = 0;
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallSucceeds());
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN),
+ SyscallSucceedsWithValue(0));
+
+ // Set and unset pgid.
+ owner.type = F_OWNER_PGRP;
+ EXPECT_THAT(owner.pid = getpgrp(), SyscallSucceeds());
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallSucceeds());
+ owner.pid = 0;
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &owner),
+ SyscallSucceeds());
+
+ EXPECT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN),
+ SyscallSucceedsWithValue(0));
+ MaybeSave();
+}
+
+TEST(FcntlTest, GetOwnExTid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex set_owner = {};
+ set_owner.type = F_OWNER_TID;
+ EXPECT_THAT(set_owner.pid = syscall(__NR_gettid), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+ SyscallSucceeds());
+
+ f_owner_ex got_owner = {};
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(got_owner.type, set_owner.type);
+ EXPECT_EQ(got_owner.pid, set_owner.pid);
+}
+
+TEST(FcntlTest, GetOwnExPid) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex set_owner = {};
+ set_owner.type = F_OWNER_PID;
+ EXPECT_THAT(set_owner.pid = getpid(), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+ SyscallSucceeds());
+
+ f_owner_ex got_owner = {};
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(got_owner.type, set_owner.type);
+ EXPECT_EQ(got_owner.pid, set_owner.pid);
+}
+
+TEST(FcntlTest, GetOwnExPgrp) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ f_owner_ex set_owner = {};
+ set_owner.type = F_OWNER_PGRP;
+ EXPECT_THAT(set_owner.pid = getpgrp(), SyscallSucceeds());
+
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN_EX, &set_owner),
+ SyscallSucceeds());
+
+ f_owner_ex got_owner = {};
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_GETOWN_EX, &got_owner),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(got_owner.type, set_owner.type);
+ EXPECT_EQ(got_owner.pid, set_owner.pid);
+}
+
+// Make sure that making multiple concurrent changes to async signal generation
+// does not cause any race issues.
+TEST(FcntlTest, SetFlSetOwnDoNotRace) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ pid_t pid;
+ EXPECT_THAT(pid = getpid(), SyscallSucceeds());
+
+ constexpr absl::Duration runtime = absl::Milliseconds(300);
+ auto setAsync = [&s, &runtime] {
+ for (auto start = absl::Now(); absl::Now() - start < runtime;) {
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETFL, O_ASYNC),
+ SyscallSucceeds());
+ sched_yield();
+ }
+ };
+ auto resetAsync = [&s, &runtime] {
+ for (auto start = absl::Now(); absl::Now() - start < runtime;) {
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETFL, 0), SyscallSucceeds());
+ sched_yield();
+ }
+ };
+ auto setOwn = [&s, &pid, &runtime] {
+ for (auto start = absl::Now(); absl::Now() - start < runtime;) {
+ ASSERT_THAT(syscall(__NR_fcntl, s.get(), F_SETOWN, pid),
+ SyscallSucceeds());
+ sched_yield();
+ }
+ };
+
+ std::list<ScopedThread> threads;
+ for (int i = 0; i < 10; i++) {
+ threads.emplace_back(setAsync);
+ threads.emplace_back(resetAsync);
+ threads.emplace_back(setOwn);
+ }
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ gvisor::testing::TestInit(&argc, &argv);
+
+ const std::string setlock_on = absl::GetFlag(FLAGS_child_setlock_on);
+ if (!setlock_on.empty()) {
+ int socket_fd = absl::GetFlag(FLAGS_socket_fd);
+ int fd = open(setlock_on.c_str(), O_RDWR, 0666);
+ if (fd == -1 && errno != 0) {
+ int err = errno;
+ std::cerr << "CHILD open " << setlock_on << " failed " << err
+ << std::endl;
+ exit(err);
+ }
+
+ struct flock fl;
+ if (absl::GetFlag(FLAGS_child_setlock_write)) {
+ fl.l_type = F_WRLCK;
+ } else {
+ fl.l_type = F_RDLCK;
+ }
+ fl.l_whence = SEEK_SET;
+ fl.l_start = absl::GetFlag(FLAGS_child_setlock_start);
+ fl.l_len = absl::GetFlag(FLAGS_child_setlock_len);
+
+ // Test the fcntl.
+ int err = 0;
+ int ret = 0;
+
+ gvisor::testing::MonotonicTimer timer;
+ timer.Start();
+ do {
+ ret = fcntl(fd, absl::GetFlag(FLAGS_blocking) ? F_SETLKW : F_SETLK, &fl);
+ } while (absl::GetFlag(FLAGS_retry_eintr) && ret == -1 && errno == EINTR);
+ auto usec = absl::ToInt64Microseconds(timer.Duration());
+
+ if (ret == -1 && errno != 0) {
+ err = errno;
+ std::cerr << "CHILD lock " << setlock_on << " failed " << err
+ << std::endl;
+ }
+
+ // If there is a socket fd let's send back the time in microseconds it took
+ // to execute this syscall.
+ if (socket_fd != -1) {
+ gvisor::testing::WriteFd(socket_fd, reinterpret_cast<void*>(&usec),
+ sizeof(usec));
+ close(socket_fd);
+ }
+
+ close(fd);
+ exit(err);
+ }
+
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h
new file mode 100644
index 000000000..fb418e052
--- /dev/null
+++ b/test/syscalls/linux/file_base.h
@@ -0,0 +1,100 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_FILE_BASE_H_
+#define GVISOR_TEST_SYSCALLS_FILE_BASE_H_
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include <cstring>
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+class FileTest : public ::testing::Test {
+ public:
+ void SetUp() override {
+ test_pipe_[0] = -1;
+ test_pipe_[1] = -1;
+
+ test_file_name_ = NewTempAbsPath();
+ test_file_fd_ = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(test_file_name_, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR));
+
+ ASSERT_THAT(pipe(test_pipe_), SyscallSucceeds());
+ ASSERT_THAT(fcntl(test_pipe_[0], F_SETFL, O_NONBLOCK), SyscallSucceeds());
+ }
+
+ // CloseFile will allow the test to manually close the file descriptor.
+ void CloseFile() { test_file_fd_.reset(); }
+
+ // UnlinkFile will allow the test to manually unlink the file.
+ void UnlinkFile() {
+ if (!test_file_name_.empty()) {
+ EXPECT_THAT(unlink(test_file_name_.c_str()), SyscallSucceeds());
+ test_file_name_.clear();
+ }
+ }
+
+ // ClosePipes will allow the test to manually close the pipes.
+ void ClosePipes() {
+ if (test_pipe_[0] > 0) {
+ EXPECT_THAT(close(test_pipe_[0]), SyscallSucceeds());
+ }
+
+ if (test_pipe_[1] > 0) {
+ EXPECT_THAT(close(test_pipe_[1]), SyscallSucceeds());
+ }
+
+ test_pipe_[0] = -1;
+ test_pipe_[1] = -1;
+ }
+
+ void TearDown() override {
+ CloseFile();
+ UnlinkFile();
+ ClosePipes();
+ }
+
+ protected:
+ std::string test_file_name_;
+ FileDescriptor test_file_fd_;
+
+ int test_pipe_[2];
+};
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_FILE_BASE_H_
diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc
new file mode 100644
index 000000000..638a93979
--- /dev/null
+++ b/test/syscalls/linux/flock.cc
@@ -0,0 +1,636 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <sys/file.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/timer_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class FlockTest : public FileTest {};
+
+TEST_F(FlockTest, InvalidOpCombinations) {
+ // The operation cannot be both exclusive and shared.
+ EXPECT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_SH | LOCK_NB),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Locking and Unlocking doesn't make sense.
+ EXPECT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_UN | LOCK_NB),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_UN | LOCK_NB),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(FlockTest, NoOperationSpecified) {
+ // Not specifying an operation is invalid.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(FlockTest, TestSimpleExLock) {
+ // Test that we can obtain an exclusive lock (no other holders)
+ // and that we can unlock it.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestSimpleShLock) {
+ // Test that we can obtain a shared lock (no other holders)
+ // and that we can unlock it.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestLockableAnyMode) {
+ // flock(2): A shared or exclusive lock can be placed on a file
+ // regardless of the mode in which the file was opened.
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(test_file_name_, O_RDONLY)); // open read only to test
+
+ // Mode shouldn't prevent us from taking an exclusive lock.
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceedsWithValue(0));
+
+ // Unlock
+ ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestUnlockWithNoHolders) {
+ // Test that unlocking when no one holds a lock succeeeds.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestRepeatedExLockingBySameHolder) {
+ // Test that repeated locking by the same holder for the
+ // same type of lock works correctly.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_EX),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_EX),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestRepeatedExLockingSingleUnlock) {
+ // Test that repeated locking by the same holder for the
+ // same type of lock works correctly and that a single unlock is required.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_EX),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_EX),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY));
+
+ // Should be unlocked at this point
+ ASSERT_THAT(flock(fd.get(), LOCK_NB | LOCK_EX), SyscallSucceedsWithValue(0));
+
+ ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestRepeatedShLockingBySameHolder) {
+ // Test that repeated locking by the same holder for the
+ // same type of lock works correctly.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_SH),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_SH),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestSingleHolderUpgrade) {
+ // Test that a shared lock is upgradable when no one else holds a lock.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_SH),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_NB | LOCK_EX),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestSingleHolderDowngrade) {
+ // Test single holder lock downgrade case.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestMultipleShared) {
+ // This is a simple test to verify that multiple independent shared
+ // locks will be granted.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // A shared lock should be granted as there only exists other shared locks.
+ ASSERT_THAT(flock(fd.get(), LOCK_SH | LOCK_NB), SyscallSucceedsWithValue(0));
+
+ // Unlock both.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+/*
+ * flock(2): If a process uses open(2) (or similar) to obtain more than one
+ * descriptor for the same file, these descriptors are treated
+ * independently by flock(). An attempt to lock the file using one of
+ * these file descriptors may be denied by a lock that the calling process
+ * has already placed via another descriptor.
+ */
+TEST_F(FlockTest, TestMultipleHolderSharedExclusive) {
+ // This test will verify that an exclusive lock will not be granted
+ // while a shared is held.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // Verify We're unable to get an exlcusive lock via the second FD.
+ // because someone is holding a shared lock.
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Unlock
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestSharedLockFailExclusiveHolder) {
+ // This test will verify that a shared lock is denied while
+ // someone holds an exclusive lock.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // Verify we're unable to get an shared lock via the second FD.
+ // because someone is holding an exclusive lock.
+ ASSERT_THAT(flock(fd.get(), LOCK_SH | LOCK_NB),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Unlock
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestExclusiveLockFailExclusiveHolder) {
+ // This test will verify that an exclusive lock is denied while
+ // someone already holds an exclsuive lock.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // Verify we're unable to get an exclusive lock via the second FD
+ // because someone is already holding an exclusive lock.
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Unlock
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestMultipleHolderSharedExclusiveUpgrade) {
+ // This test will verify that we cannot obtain an exclusive lock while
+ // a shared lock is held by another descriptor, then verify that an upgrade
+ // is possible on a shared lock once all other shared locks have closed.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // Verify we're unable to get an exclusive lock via the second FD because
+ // a shared lock is held.
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Verify that we can get a shared lock via the second descriptor instead
+ ASSERT_THAT(flock(fd.get(), LOCK_SH | LOCK_NB), SyscallSucceedsWithValue(0));
+
+ // Unlock the first and there will only be one shared lock remaining.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+
+ // Upgrade 2nd fd.
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceedsWithValue(0));
+
+ // Finally unlock the second
+ ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestMultipleHolderSharedExclusiveDowngrade) {
+ // This test will verify that a shared lock is not obtainable while an
+ // exclusive lock is held but that once the first is downgraded that
+ // the second independent file descriptor can also get a shared lock.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // Verify We're unable to get a shared lock via the second FD because
+ // an exclusive lock is held.
+ ASSERT_THAT(flock(fd.get(), LOCK_SH | LOCK_NB),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Verify that we can downgrade the first.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ // Now verify that we can obtain a shared lock since the first was downgraded.
+ ASSERT_THAT(flock(fd.get(), LOCK_SH | LOCK_NB), SyscallSucceedsWithValue(0));
+
+ // Finally unlock both.
+ ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+/*
+ * flock(2): Locks created by flock() are associated with an open file table
+ * entry. This means that duplicate file descriptors (created by, for example,
+ * fork(2) or dup(2)) refer to the same lock, and this lock may be modified or
+ * released using any of these descriptors. Furthermore, the lock is released
+ * either by an explicit LOCK_UN operation on any of these duplicate descriptors
+ * or when all such descriptors have been closed.
+ */
+TEST_F(FlockTest, TestDupFdUpgrade) {
+ // This test will verify that a shared lock is upgradeable via a dupped
+ // file descriptor, if the FD wasn't dupped this would fail.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ const FileDescriptor dup_fd = ASSERT_NO_ERRNO_AND_VALUE(test_file_fd_.Dup());
+
+ // Now we should be able to upgrade via the dupped fd.
+ ASSERT_THAT(flock(dup_fd.get(), LOCK_EX | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ // Validate unlock via dupped fd.
+ ASSERT_THAT(flock(dup_fd.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestDupFdDowngrade) {
+ // This test will verify that a exclusive lock is downgradable via a dupped
+ // file descriptor, if the FD wasn't dupped this would fail.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ const FileDescriptor dup_fd = ASSERT_NO_ERRNO_AND_VALUE(test_file_fd_.Dup());
+
+ // Now we should be able to downgrade via the dupped fd.
+ ASSERT_THAT(flock(dup_fd.get(), LOCK_SH | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ // Validate unlock via dupped fd
+ ASSERT_THAT(flock(dup_fd.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestDupFdCloseRelease) {
+ // flock(2): Furthermore, the lock is released either by an explicit LOCK_UN
+ // operation on any of these duplicate descriptors, or when all such
+ // descriptors have been closed.
+ //
+ // This test will verify that a dupped fd closing will not release the
+ // underlying lock until all such dupped fds have closed.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ FileDescriptor dup_fd = ASSERT_NO_ERRNO_AND_VALUE(test_file_fd_.Dup());
+
+ // At this point we have ONE exclusive locked referenced by two different fds.
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // Validate that we cannot get a lock on a new unrelated FD.
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Closing the dupped fd shouldn't affect the lock until all are closed.
+ dup_fd.reset(); // Closed the duped fd.
+
+ // Validate that we still cannot get a lock on a new unrelated FD.
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Closing the first fd
+ CloseFile(); // Will validate the syscall succeeds.
+
+ // Now we should actually be able to get a lock since all fds related to
+ // the first lock are closed.
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceedsWithValue(0));
+
+ // Unlock.
+ ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestDupFdUnlockRelease) {
+ /* flock(2): Furthermore, the lock is released either by an explicit LOCK_UN
+ * operation on any of these duplicate descriptors, or when all such
+ * descriptors have been closed.
+ */
+ // This test will verify that an explict unlock on a dupped FD will release
+ // the underlying lock unlike the previous case where close on a dup was
+ // not enough to release the lock.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_NB),
+ SyscallSucceedsWithValue(0));
+
+ const FileDescriptor dup_fd = ASSERT_NO_ERRNO_AND_VALUE(test_file_fd_.Dup());
+
+ // At this point we have ONE exclusive locked referenced by two different fds.
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // Validate that we cannot get a lock on a new unrelated FD.
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Explicitly unlock via the dupped descriptor.
+ ASSERT_THAT(flock(dup_fd.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+
+ // Validate that we can now get the lock since we explicitly unlocked.
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceedsWithValue(0));
+
+ // Unlock
+ ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(FlockTest, TestDupFdFollowedByLock) {
+ // This test will verify that taking a lock on a file descriptor that has
+ // already been dupped means that the lock is shared between both. This is
+ // slightly different than than duping on an already locked FD.
+ FileDescriptor dup_fd = ASSERT_NO_ERRNO_AND_VALUE(test_file_fd_.Dup());
+
+ // Take a lock.
+ ASSERT_THAT(flock(dup_fd.get(), LOCK_EX | LOCK_NB), SyscallSucceeds());
+
+ // Now dup_fd and test_file_ should both reference the same lock.
+ // We shouldn't be able to obtain a lock until both are closed.
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // Closing the first fd
+ dup_fd.reset(); // Close the duped fd.
+
+ // Validate that we cannot get a lock yet because the dupped descriptor.
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Closing the second fd.
+ CloseFile(); // CloseFile() will validate the syscall succeeds.
+
+ // Now we should be able to get the lock.
+ ASSERT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceeds());
+
+ // Unlock.
+ ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceedsWithValue(0));
+}
+
+// NOTE: These blocking tests are not perfect. Unfortunately it's very hard to
+// determine if a thread was actually blocked in the kernel so we're forced
+// to use timing.
+TEST_F(FlockTest, BlockingLockNoBlockingForSharedLocks_NoRandomSave) {
+ // This test will verify that although LOCK_NB isn't specified
+ // two different fds can obtain shared locks without blocking.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH), SyscallSucceeds());
+
+ // kHoldLockTime is the amount of time we will hold the lock before releasing.
+ constexpr absl::Duration kHoldLockTime = absl::Seconds(30);
+
+ const DisableSave ds; // Timing-related.
+
+ // We do this in another thread so we can determine if it was actually
+ // blocked by timing the amount of time it took for the syscall to complete.
+ ScopedThread t([&] {
+ MonotonicTimer timer;
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // Only a single shared lock is held, the lock will be granted immediately.
+ // This should be granted without any blocking. Don't save here to avoid
+ // wild discrepencies on timing.
+ timer.Start();
+ ASSERT_THAT(flock(fd.get(), LOCK_SH), SyscallSucceeds());
+
+ // We held the lock for 30 seconds but this thread should not have
+ // blocked at all so we expect a very small duration on syscall completion.
+ ASSERT_LT(timer.Duration(),
+ absl::Seconds(1)); // 1000ms is much less than 30s.
+
+ // We can release our second shared lock
+ ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceeds());
+ });
+
+ // Sleep before unlocking.
+ absl::SleepFor(kHoldLockTime);
+
+ // Release the first shared lock. Don't save in this situation to avoid
+ // discrepencies in timing.
+ EXPECT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceeds());
+}
+
+TEST_F(FlockTest, BlockingLockFirstSharedSecondExclusive_NoRandomSave) {
+ // This test will verify that if someone holds a shared lock any attempt to
+ // obtain an exclusive lock will result in blocking.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_SH), SyscallSucceeds());
+
+ // kHoldLockTime is the amount of time we will hold the lock before releasing.
+ constexpr absl::Duration kHoldLockTime = absl::Seconds(2);
+
+ const DisableSave ds; // Timing-related.
+
+ // We do this in another thread so we can determine if it was actually
+ // blocked by timing the amount of time it took for the syscall to complete.
+ ScopedThread t([&] {
+ MonotonicTimer timer;
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // This exclusive lock should block because someone is already holding a
+ // shared lock. We don't save here to avoid wild discrepencies on timing.
+ timer.Start();
+ ASSERT_THAT(RetryEINTR(flock)(fd.get(), LOCK_EX), SyscallSucceeds());
+
+ // We should be blocked, we will expect to be blocked for more than 1.0s.
+ ASSERT_GT(timer.Duration(), absl::Seconds(1));
+
+ // We can release our exclusive lock.
+ ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceeds());
+ });
+
+ // Sleep before unlocking.
+ absl::SleepFor(kHoldLockTime);
+
+ // Release the shared lock allowing the thread to proceed.
+ // We don't save here to avoid wild discrepencies in timing.
+ EXPECT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceeds());
+}
+
+TEST_F(FlockTest, BlockingLockFirstExclusiveSecondShared_NoRandomSave) {
+ // This test will verify that if someone holds an exclusive lock any attempt
+ // to obtain a shared lock will result in blocking.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX), SyscallSucceeds());
+
+ // kHoldLockTime is the amount of time we will hold the lock before releasing.
+ constexpr absl::Duration kHoldLockTime = absl::Seconds(2);
+
+ const DisableSave ds; // Timing-related.
+
+ // We do this in another thread so we can determine if it was actually
+ // blocked by timing the amount of time it took for the syscall to complete.
+ ScopedThread t([&] {
+ MonotonicTimer timer;
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // This shared lock should block because someone is already holding an
+ // exclusive lock. We don't save here to avoid wild discrepencies on timing.
+ timer.Start();
+ ASSERT_THAT(RetryEINTR(flock)(fd.get(), LOCK_SH), SyscallSucceeds());
+
+ // We should be blocked, we will expect to be blocked for more than 1.0s.
+ ASSERT_GT(timer.Duration(), absl::Seconds(1));
+
+ // We can release our shared lock.
+ ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceeds());
+ });
+
+ // Sleep before unlocking.
+ absl::SleepFor(kHoldLockTime);
+
+ // Release the exclusive lock allowing the blocked thread to proceed.
+ // We don't save here to avoid wild discrepencies in timing.
+ EXPECT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceeds());
+}
+
+TEST_F(FlockTest, BlockingLockFirstExclusiveSecondExclusive_NoRandomSave) {
+ // This test will verify that if someone holds an exclusive lock any attempt
+ // to obtain another exclusive lock will result in blocking.
+ ASSERT_THAT(flock(test_file_fd_.get(), LOCK_EX), SyscallSucceeds());
+
+ // kHoldLockTime is the amount of time we will hold the lock before releasing.
+ constexpr absl::Duration kHoldLockTime = absl::Seconds(2);
+
+ const DisableSave ds; // Timing-related.
+
+ // We do this in another thread so we can determine if it was actually
+ // blocked by timing the amount of time it took for the syscall to complete.
+ ScopedThread t([&] {
+ MonotonicTimer timer;
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ // This exclusive lock should block because someone is already holding an
+ // exclusive lock.
+ timer.Start();
+ ASSERT_THAT(RetryEINTR(flock)(fd.get(), LOCK_EX), SyscallSucceeds());
+
+ // We should be blocked, we will expect to be blocked for more than 1.0s.
+ ASSERT_GT(timer.Duration(), absl::Seconds(1));
+
+ // We can release our exclusive lock.
+ ASSERT_THAT(flock(fd.get(), LOCK_UN), SyscallSucceeds());
+ });
+
+ // Sleep before unlocking.
+ absl::SleepFor(kHoldLockTime);
+
+ // Release the exclusive lock allowing the blocked thread to proceed.
+ // We don't save to avoid wild discrepencies in timing.
+ EXPECT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceeds());
+}
+
+TEST(FlockTestNoFixture, BadFD) {
+ // EBADF: fd is not an open file descriptor.
+ ASSERT_THAT(flock(-1, 0), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(FlockTestNoFixture, FlockDir) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0000));
+ EXPECT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceeds());
+}
+
+TEST(FlockTestNoFixture, FlockSymlink) {
+ // TODO(gvisor.dev/issue/2782): Replace with IsRunningWithVFS1() when O_PATH
+ // is supported.
+ SKIP_IF(IsRunningOnGvisor());
+
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ auto symlink = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), file.path()));
+
+ auto fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(symlink.path(), O_RDONLY | O_PATH, 0000));
+ EXPECT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(FlockTestNoFixture, FlockProc) {
+ auto fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/status", O_RDONLY, 0000));
+ EXPECT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceeds());
+}
+
+TEST(FlockTestNoFixture, FlockPipe) {
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+
+ EXPECT_THAT(flock(fds[0], LOCK_EX | LOCK_NB), SyscallSucceeds());
+ // Check that the pipe was locked above.
+ EXPECT_THAT(flock(fds[1], LOCK_EX | LOCK_NB), SyscallFailsWithErrno(EAGAIN));
+
+ EXPECT_THAT(flock(fds[0], LOCK_UN), SyscallSucceeds());
+ EXPECT_THAT(flock(fds[1], LOCK_EX | LOCK_NB), SyscallSucceeds());
+
+ EXPECT_THAT(close(fds[0]), SyscallSucceeds());
+ EXPECT_THAT(close(fds[1]), SyscallSucceeds());
+}
+
+TEST(FlockTestNoFixture, FlockSocket) {
+ int sock = socket(AF_UNIX, SOCK_STREAM, 0);
+ ASSERT_THAT(sock, SyscallSucceeds());
+
+ struct sockaddr_un addr =
+ ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(true /* abstract */, AF_UNIX));
+ ASSERT_THAT(
+ bind(sock, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+ SyscallSucceeds());
+
+ EXPECT_THAT(flock(sock, LOCK_EX | LOCK_NB), SyscallSucceeds());
+ EXPECT_THAT(close(sock), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/fork.cc b/test/syscalls/linux/fork.cc
new file mode 100644
index 000000000..853f6231a
--- /dev/null
+++ b/test/syscalls/linux/fork.cc
@@ -0,0 +1,464 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <cstdlib>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/capability_util.h"
+#include "test/util/logging.h"
+#include "test/util/memory_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using ::testing::Ge;
+
+class ForkTest : public ::testing::Test {
+ protected:
+ // SetUp creates a populated, open file.
+ void SetUp() override {
+ // Make a shared mapping.
+ shared_ = reinterpret_cast<char*>(mmap(0, kPageSize, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, -1, 0));
+ ASSERT_NE(reinterpret_cast<void*>(shared_), MAP_FAILED);
+
+ // Make a private mapping.
+ private_ =
+ reinterpret_cast<char*>(mmap(0, kPageSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+ ASSERT_NE(reinterpret_cast<void*>(private_), MAP_FAILED);
+
+ // Make a pipe.
+ ASSERT_THAT(pipe(pipes_), SyscallSucceeds());
+ }
+
+ // TearDown frees associated resources.
+ void TearDown() override {
+ EXPECT_THAT(munmap(shared_, kPageSize), SyscallSucceeds());
+ EXPECT_THAT(munmap(private_, kPageSize), SyscallSucceeds());
+ EXPECT_THAT(close(pipes_[0]), SyscallSucceeds());
+ EXPECT_THAT(close(pipes_[1]), SyscallSucceeds());
+ }
+
+ // Fork executes a clone system call.
+ pid_t Fork() {
+ pid_t pid = fork();
+ MaybeSave();
+ TEST_PCHECK_MSG(pid >= 0, "fork failed");
+ return pid;
+ }
+
+ // Wait waits for the given pid and returns the exit status. If the child was
+ // killed by a signal or an error occurs, then 256+signal is returned.
+ int Wait(pid_t pid) {
+ int status;
+ while (true) {
+ int rval = wait4(pid, &status, 0, NULL);
+ if (rval < 0) {
+ return rval;
+ }
+ if (rval != pid) {
+ continue;
+ }
+ if (WIFEXITED(status)) {
+ return WEXITSTATUS(status);
+ }
+ if (WIFSIGNALED(status)) {
+ return 256 + WTERMSIG(status);
+ }
+ }
+ }
+
+ // Exit exits the proccess.
+ void Exit(int code) {
+ _exit(code);
+
+ // Should never reach here. Since the exit above failed, we really don't
+ // have much in the way of options to indicate failure. So we just try to
+ // log an assertion failure to the logs. The parent process will likely
+ // fail anyways if exit is not working.
+ TEST_CHECK_MSG(false, "_exit returned");
+ }
+
+ // ReadByte reads a byte from the shared pipe.
+ char ReadByte() {
+ char val = -1;
+ TEST_PCHECK(ReadFd(pipes_[0], &val, 1) == 1);
+ MaybeSave();
+ return val;
+ }
+
+ // WriteByte writes a byte from the shared pipe.
+ void WriteByte(char val) {
+ TEST_PCHECK(WriteFd(pipes_[1], &val, 1) == 1);
+ MaybeSave();
+ }
+
+ // Shared pipe.
+ int pipes_[2];
+
+ // Shared mapping (one page).
+ char* shared_;
+
+ // Private mapping (one page).
+ char* private_;
+};
+
+TEST_F(ForkTest, Simple) {
+ pid_t child = Fork();
+ if (child == 0) {
+ Exit(0);
+ }
+ EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(ForkTest, ExitCode) {
+ pid_t child = Fork();
+ if (child == 0) {
+ Exit(123);
+ }
+ EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(123));
+ child = Fork();
+ if (child == 0) {
+ Exit(1);
+ }
+ EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(1));
+}
+
+TEST_F(ForkTest, Multi) {
+ pid_t child1 = Fork();
+ if (child1 == 0) {
+ Exit(0);
+ }
+ pid_t child2 = Fork();
+ if (child2 == 0) {
+ Exit(1);
+ }
+ EXPECT_THAT(Wait(child1), SyscallSucceedsWithValue(0));
+ EXPECT_THAT(Wait(child2), SyscallSucceedsWithValue(1));
+}
+
+TEST_F(ForkTest, Pipe) {
+ pid_t child = Fork();
+ if (child == 0) {
+ WriteByte(1);
+ Exit(0);
+ }
+ EXPECT_EQ(ReadByte(), 1);
+ EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(ForkTest, SharedMapping) {
+ pid_t child = Fork();
+ if (child == 0) {
+ // Wait for the parent.
+ ReadByte();
+ if (shared_[0] == 1) {
+ Exit(0);
+ }
+ // Failed.
+ Exit(1);
+ }
+ // Change the mapping.
+ ASSERT_EQ(shared_[0], 0);
+ shared_[0] = 1;
+ // Unblock the child.
+ WriteByte(0);
+ // Did it work?
+ EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(ForkTest, PrivateMapping) {
+ pid_t child = Fork();
+ if (child == 0) {
+ // Wait for the parent.
+ ReadByte();
+ if (private_[0] == 0) {
+ Exit(0);
+ }
+ // Failed.
+ Exit(1);
+ }
+ // Change the mapping.
+ ASSERT_EQ(private_[0], 0);
+ private_[0] = 1;
+ // Unblock the child.
+ WriteByte(0);
+ // Did it work?
+ EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
+}
+
+// CPUID is x86 specific.
+#ifdef __x86_64__
+// Test that cpuid works after a fork.
+TEST_F(ForkTest, Cpuid) {
+ pid_t child = Fork();
+
+ // We should be able to determine the CPU vendor.
+ ASSERT_NE(GetCPUVendor(), CPUVendor::kUnknownVendor);
+
+ if (child == 0) {
+ Exit(0);
+ }
+ EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
+}
+#endif
+
+TEST_F(ForkTest, Mmap) {
+ pid_t child = Fork();
+
+ if (child == 0) {
+ void* addr =
+ mmap(0, kPageSize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ MaybeSave();
+ Exit(addr == MAP_FAILED);
+ }
+
+ EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
+}
+
+static volatile int alarmed = 0;
+
+void AlarmHandler(int sig, siginfo_t* info, void* context) { alarmed = 1; }
+
+TEST_F(ForkTest, Alarm) {
+ // Setup an alarm handler.
+ struct sigaction sa;
+ sa.sa_sigaction = AlarmHandler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ EXPECT_THAT(sigaction(SIGALRM, &sa, nullptr), SyscallSucceeds());
+
+ pid_t child = Fork();
+
+ if (child == 0) {
+ alarm(1);
+ sleep(3);
+ if (!alarmed) {
+ Exit(1);
+ }
+ Exit(0);
+ }
+
+ EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, alarmed);
+}
+
+// Child cannot affect parent private memory. Regression test for b/24137240.
+TEST_F(ForkTest, PrivateMemory) {
+ std::atomic<uint32_t> local(0);
+
+ pid_t child1 = Fork();
+ if (child1 == 0) {
+ local++;
+
+ pid_t child2 = Fork();
+ if (child2 == 0) {
+ local++;
+
+ TEST_CHECK(local.load() == 2);
+
+ Exit(0);
+ }
+
+ TEST_PCHECK(Wait(child2) == 0);
+ TEST_CHECK(local.load() == 1);
+ Exit(0);
+ }
+
+ EXPECT_THAT(Wait(child1), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, local.load());
+}
+
+// Kernel-accessed buffers should remain coherent across COW.
+//
+// The buffer must be >= usermem.ZeroCopyMinBytes, as UnsafeAccess operates
+// differently. Regression test for b/33811887.
+TEST_F(ForkTest, COWSegment) {
+ constexpr int kBufSize = 1024;
+ char* read_buf = private_;
+ char* touch = private_ + kPageSize / 2;
+
+ std::string contents(kBufSize, 'a');
+
+ ScopedThread t([&] {
+ // Wait to be sure the parent is blocked in read.
+ absl::SleepFor(absl::Seconds(3));
+
+ // Fork to mark private pages for COW.
+ //
+ // Use fork directly rather than the Fork wrapper to skip the multi-threaded
+ // check, and limit the child to async-signal-safe functions:
+ //
+ // "After a fork() in a multithreaded program, the child can safely call
+ // only async-signal-safe functions (see signal(7)) until such time as it
+ // calls execve(2)."
+ //
+ // Skip ASSERT in the child, as it isn't async-signal-safe.
+ pid_t child = fork();
+ if (child == 0) {
+ // Wait to be sure parent touched memory.
+ sleep(3);
+ Exit(0);
+ }
+
+ // Check success only in the parent.
+ ASSERT_THAT(child, SyscallSucceedsWithValue(Ge(0)));
+
+ // Trigger COW on private page.
+ *touch = 42;
+
+ // Write to pipe. Parent should still be able to read this.
+ EXPECT_THAT(WriteFd(pipes_[1], contents.c_str(), kBufSize),
+ SyscallSucceedsWithValue(kBufSize));
+
+ EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
+ });
+
+ EXPECT_THAT(ReadFd(pipes_[0], read_buf, kBufSize),
+ SyscallSucceedsWithValue(kBufSize));
+ EXPECT_STREQ(contents.c_str(), read_buf);
+}
+
+TEST_F(ForkTest, SigAltStack) {
+ std::vector<char> stack_mem(SIGSTKSZ);
+ stack_t stack = {};
+ stack.ss_size = SIGSTKSZ;
+ stack.ss_sp = stack_mem.data();
+ ASSERT_THAT(sigaltstack(&stack, nullptr), SyscallSucceeds());
+
+ pid_t child = Fork();
+
+ if (child == 0) {
+ stack_t oss = {};
+ TEST_PCHECK(sigaltstack(nullptr, &oss) == 0);
+ MaybeSave();
+
+ TEST_CHECK((oss.ss_flags & SS_DISABLE) == 0);
+ TEST_CHECK(oss.ss_size == SIGSTKSZ);
+ TEST_CHECK(oss.ss_sp == stack.ss_sp);
+
+ Exit(0);
+ }
+ EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(ForkTest, Affinity) {
+ // Make a non-default cpumask.
+ cpu_set_t parent_mask;
+ EXPECT_THAT(sched_getaffinity(/*pid=*/0, sizeof(cpu_set_t), &parent_mask),
+ SyscallSucceeds());
+ // Knock out the lowest bit.
+ for (unsigned int n = 0; n < CPU_SETSIZE; n++) {
+ if (CPU_ISSET(n, &parent_mask)) {
+ CPU_CLR(n, &parent_mask);
+ break;
+ }
+ }
+ EXPECT_THAT(sched_setaffinity(/*pid=*/0, sizeof(cpu_set_t), &parent_mask),
+ SyscallSucceeds());
+
+ pid_t child = Fork();
+ if (child == 0) {
+ cpu_set_t child_mask;
+
+ int ret = sched_getaffinity(/*pid=*/0, sizeof(cpu_set_t), &child_mask);
+ MaybeSave();
+ if (ret < 0) {
+ Exit(-ret);
+ }
+
+ TEST_CHECK(CPU_EQUAL(&child_mask, &parent_mask));
+
+ Exit(0);
+ }
+
+ EXPECT_THAT(Wait(child), SyscallSucceedsWithValue(0));
+}
+
+TEST(CloneTest, NewUserNamespacePermitsAllOtherNamespaces) {
+ // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a
+ // single clone(2) or unshare(2) call, the user namespace is guaranteed to be
+ // created first, giving the child (clone(2)) or caller (unshare(2))
+ // privileges over the remaining namespaces created by the call. Thus, it is
+ // possible for an unprivileged caller to specify this combination of flags."
+ // - user_namespaces(7)
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
+ Mapping child_stack = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ int child_pid;
+ // We only test with CLONE_NEWIPC, CLONE_NEWNET, and CLONE_NEWUTS since these
+ // namespaces were implemented in Linux before user namespaces.
+ ASSERT_THAT(
+ child_pid = clone(
+ +[](void*) { return 0; },
+ reinterpret_cast<void*>(child_stack.addr() + kPageSize),
+ CLONE_NEWUSER | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUTS | SIGCHLD,
+ /* arg = */ nullptr),
+ SyscallSucceeds());
+
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status = " << status;
+}
+
+// Clone with CLONE_SETTLS and a non-canonical TLS address is rejected.
+TEST(CloneTest, NonCanonicalTLS) {
+ constexpr uintptr_t kNonCanonical = 1ull << 48;
+
+ // We need a valid address for the stack pointer. We'll never actually execute
+ // on this.
+ char stack;
+
+ // The raw system call interface on x86-64 is:
+ // long clone(unsigned long flags, void *stack,
+ // int *parent_tid, int *child_tid,
+ // unsigned long tls);
+ //
+ // While on arm64, the order of the last two arguments is reversed:
+ // long clone(unsigned long flags, void *stack,
+ // int *parent_tid, unsigned long tls,
+ // int *child_tid);
+#if defined(__x86_64__)
+ EXPECT_THAT(syscall(__NR_clone, SIGCHLD | CLONE_SETTLS, &stack, nullptr,
+ nullptr, kNonCanonical),
+ SyscallFailsWithErrno(EPERM));
+#elif defined(__aarch64__)
+ EXPECT_THAT(syscall(__NR_clone, SIGCHLD | CLONE_SETTLS, &stack, nullptr,
+ kNonCanonical, nullptr),
+ SyscallFailsWithErrno(EPERM));
+#endif
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/fpsig_fork.cc b/test/syscalls/linux/fpsig_fork.cc
new file mode 100644
index 000000000..c47567b4e
--- /dev/null
+++ b/test/syscalls/linux/fpsig_fork.cc
@@ -0,0 +1,131 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This test verifies that fork(2) in a signal handler will correctly
+// restore floating point state after the signal handler returns in both
+// the child and parent.
+#include <sys/time.h>
+
+#include "gtest/gtest.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+#ifdef __x86_64__
+#define GET_XMM(__var, __xmm) \
+ asm volatile("movq %%" #__xmm ", %0" : "=r"(__var))
+#define SET_XMM(__var, __xmm) asm volatile("movq %0, %%" #__xmm : : "r"(__var))
+#define GET_FP0(__var) GET_XMM(__var, xmm0)
+#define SET_FP0(__var) SET_XMM(__var, xmm0)
+#elif __aarch64__
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+#define GET_FPREG(var, regname) \
+ asm volatile("str " __stringify(regname) ", %0" : "=m"(var))
+#define SET_FPREG(var, regname) \
+ asm volatile("ldr " __stringify(regname) ", %0" : "=m"(var))
+#define GET_FP0(var) GET_FPREG(var, d0)
+#define SET_FP0(var) SET_FPREG(var, d0)
+#endif
+
+int parent, child;
+
+void sigusr1(int s, siginfo_t* siginfo, void* _uc) {
+ // Fork and clobber %xmm0. The fpstate should be restored by sigreturn(2)
+ // in both parent and child.
+ child = fork();
+ TEST_CHECK_MSG(child >= 0, "fork failed");
+
+ uint64_t val = SIGUSR1;
+ SET_FP0(val);
+ uint64_t got;
+ GET_FP0(got);
+ TEST_CHECK_MSG(val == got, "Basic FP check failed in sigusr1()");
+}
+
+TEST(FPSigTest, Fork) {
+ parent = getpid();
+ pid_t parent_tid = gettid();
+
+ struct sigaction sa = {};
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ sa.sa_sigaction = sigusr1;
+ ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
+
+ // The amd64 ABI specifies that the XMM register set is caller-saved. This
+ // implies that if there is any function call between SET_XMM and GET_XMM the
+ // compiler might save/restore xmm0 implicitly. This defeats the entire
+ // purpose of the test which is to verify that fpstate is restored by
+ // sigreturn(2).
+ //
+ // This is the reason why 'tgkill(getpid(), gettid(), SIGUSR1)' is implemented
+ // in inline assembly below.
+ //
+ // If the OS is broken and registers are clobbered by the child, using tgkill
+ // to signal the current thread increases the likelihood that this thread will
+ // be the one clobbered.
+
+ uint64_t expected = 0xdeadbeeffacefeed;
+ SET_FP0(expected);
+
+#ifdef __x86_64__
+ asm volatile(
+ "movl %[killnr], %%eax;"
+ "movl %[parent], %%edi;"
+ "movl %[tid], %%esi;"
+ "movl %[sig], %%edx;"
+ "syscall;"
+ :
+ : [ killnr ] "i"(__NR_tgkill), [ parent ] "rm"(parent),
+ [ tid ] "rm"(parent_tid), [ sig ] "i"(SIGUSR1)
+ : "rax", "rdi", "rsi", "rdx",
+ // Clobbered by syscall.
+ "rcx", "r11");
+#elif __aarch64__
+ asm volatile(
+ "mov x8, %0\n"
+ "mov x0, %1\n"
+ "mov x1, %2\n"
+ "mov x2, %3\n"
+ "svc #0\n" ::"r"(__NR_tgkill),
+ "r"(parent), "r"(parent_tid), "r"(SIGUSR1));
+#endif
+
+ uint64_t got;
+ GET_FP0(got);
+
+ if (getpid() == parent) { // Parent.
+ int status;
+ ASSERT_THAT(waitpid(child, &status, 0), SyscallSucceedsWithValue(child));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0);
+ }
+
+ // TEST_CHECK_MSG since this may run in the child.
+ TEST_CHECK_MSG(expected == got, "Bad xmm0 value");
+
+ if (getpid() != parent) { // Child.
+ _exit(0);
+ }
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/fpsig_nested.cc b/test/syscalls/linux/fpsig_nested.cc
new file mode 100644
index 000000000..302d928d1
--- /dev/null
+++ b/test/syscalls/linux/fpsig_nested.cc
@@ -0,0 +1,167 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This program verifies that application floating point state is restored
+// correctly after a signal handler returns. It also verifies that this works
+// with nested signals.
+#include <sys/time.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+#ifdef __x86_64__
+#define GET_XMM(__var, __xmm) \
+ asm volatile("movq %%" #__xmm ", %0" : "=r"(__var))
+#define SET_XMM(__var, __xmm) asm volatile("movq %0, %%" #__xmm : : "r"(__var))
+#define GET_FP0(__var) GET_XMM(__var, xmm0)
+#define SET_FP0(__var) SET_XMM(__var, xmm0)
+#elif __aarch64__
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+#define GET_FPREG(var, regname) \
+ asm volatile("str " __stringify(regname) ", %0" : "=m"(var))
+#define SET_FPREG(var, regname) \
+ asm volatile("ldr " __stringify(regname) ", %0" : "=m"(var))
+#define GET_FP0(var) GET_FPREG(var, d0)
+#define SET_FP0(var) SET_FPREG(var, d0)
+#endif
+
+int pid;
+int tid;
+
+volatile uint64_t entryxmm[2] = {~0UL, ~0UL};
+volatile uint64_t exitxmm[2];
+
+void sigusr2(int s, siginfo_t* siginfo, void* _uc) {
+ uint64_t val = SIGUSR2;
+
+ // Record the value of %xmm0 on entry and then clobber it.
+ GET_FP0(entryxmm[1]);
+ SET_FP0(val);
+ GET_FP0(exitxmm[1]);
+}
+
+void sigusr1(int s, siginfo_t* siginfo, void* _uc) {
+ uint64_t val = SIGUSR1;
+
+ // Record the value of %xmm0 on entry and then clobber it.
+ GET_FP0(entryxmm[0]);
+ SET_FP0(val);
+
+ // Send a SIGUSR2 to ourself. The signal mask is configured such that
+ // the SIGUSR2 handler will run before this handler returns.
+#ifdef __x86_64__
+ asm volatile(
+ "movl %[killnr], %%eax;"
+ "movl %[pid], %%edi;"
+ "movl %[tid], %%esi;"
+ "movl %[sig], %%edx;"
+ "syscall;"
+ :
+ : [ killnr ] "i"(__NR_tgkill), [ pid ] "rm"(pid), [ tid ] "rm"(tid),
+ [ sig ] "i"(SIGUSR2)
+ : "rax", "rdi", "rsi", "rdx",
+ // Clobbered by syscall.
+ "rcx", "r11");
+#elif __aarch64__
+ asm volatile(
+ "mov x8, %0\n"
+ "mov x0, %1\n"
+ "mov x1, %2\n"
+ "mov x2, %3\n"
+ "svc #0\n" ::"r"(__NR_tgkill),
+ "r"(pid), "r"(tid), "r"(SIGUSR2));
+#endif
+
+ // Record value of %xmm0 again to verify that the nested signal handler
+ // does not clobber it.
+ GET_FP0(exitxmm[0]);
+}
+
+TEST(FPSigTest, NestedSignals) {
+ pid = getpid();
+ tid = gettid();
+
+ struct sigaction sa = {};
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ sa.sa_sigaction = sigusr1;
+ ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
+
+ sa.sa_sigaction = sigusr2;
+ ASSERT_THAT(sigaction(SIGUSR2, &sa, nullptr), SyscallSucceeds());
+
+ // The amd64 ABI specifies that the XMM register set is caller-saved. This
+ // implies that if there is any function call between SET_XMM and GET_XMM the
+ // compiler might save/restore xmm0 implicitly. This defeats the entire
+ // purpose of the test which is to verify that fpstate is restored by
+ // sigreturn(2).
+ //
+ // This is the reason why 'tgkill(getpid(), gettid(), SIGUSR1)' is implemented
+ // in inline assembly below.
+ //
+ // If the OS is broken and registers are clobbered by the signal, using tgkill
+ // to signal the current thread ensures that this is the clobbered thread.
+
+ uint64_t expected = 0xdeadbeeffacefeed;
+ SET_FP0(expected);
+
+#ifdef __x86_64__
+ asm volatile(
+ "movl %[killnr], %%eax;"
+ "movl %[pid], %%edi;"
+ "movl %[tid], %%esi;"
+ "movl %[sig], %%edx;"
+ "syscall;"
+ :
+ : [ killnr ] "i"(__NR_tgkill), [ pid ] "rm"(pid), [ tid ] "rm"(tid),
+ [ sig ] "i"(SIGUSR1)
+ : "rax", "rdi", "rsi", "rdx",
+ // Clobbered by syscall.
+ "rcx", "r11");
+#elif __aarch64__
+ asm volatile(
+ "mov x8, %0\n"
+ "mov x0, %1\n"
+ "mov x1, %2\n"
+ "mov x2, %3\n"
+ "svc #0\n" ::"r"(__NR_tgkill),
+ "r"(pid), "r"(tid), "r"(SIGUSR1));
+#endif
+
+ uint64_t got;
+ GET_FP0(got);
+
+ //
+ // The checks below verifies the following:
+ // - signal handlers must called with a clean fpu state.
+ // - sigreturn(2) must restore fpstate of the interrupted context.
+ //
+ EXPECT_EQ(expected, got);
+ EXPECT_EQ(entryxmm[0], 0);
+ EXPECT_EQ(entryxmm[1], 0);
+ EXPECT_EQ(exitxmm[0], SIGUSR1);
+ EXPECT_EQ(exitxmm[1], SIGUSR2);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/fsync.cc b/test/syscalls/linux/fsync.cc
new file mode 100644
index 000000000..e7e057f06
--- /dev/null
+++ b/test/syscalls/linux/fsync.cc
@@ -0,0 +1,58 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(FsyncTest, TempFileSucceeds) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0666));
+ const std::string data = "some data to sync";
+ EXPECT_THAT(write(fd.get(), data.c_str(), data.size()),
+ SyscallSucceedsWithValue(data.size()));
+ EXPECT_THAT(fsync(fd.get()), SyscallSucceeds());
+}
+
+TEST(FsyncTest, TempDirSucceeds) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY));
+ EXPECT_THAT(fsync(fd.get()), SyscallSucceeds());
+}
+
+TEST(FsyncTest, CannotFsyncOnUnopenedFd) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ int fd;
+ ASSERT_THAT(fd = open(file.path().c_str(), O_RDONLY), SyscallSucceeds());
+ ASSERT_THAT(close(fd), SyscallSucceeds());
+
+ // fd is now invalid.
+ EXPECT_THAT(fsync(fd), SyscallFailsWithErrno(EBADF));
+}
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/futex.cc b/test/syscalls/linux/futex.cc
new file mode 100644
index 000000000..40c80a6e1
--- /dev/null
+++ b/test/syscalls/linux/futex.cc
@@ -0,0 +1,742 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <linux/futex.h>
+#include <linux/types.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <memory>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/memory/memory.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/memory_util.h"
+#include "test/util/save_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/time_util.h"
+#include "test/util/timer_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Amount of time we wait for threads doing futex_wait to start running before
+// doing futex_wake.
+constexpr auto kWaiterStartupDelay = absl::Seconds(3);
+
+// Default timeout for waiters in tests where we expect a futex_wake to be
+// ineffective.
+constexpr auto kIneffectiveWakeTimeout = absl::Seconds(6);
+
+static_assert(kWaiterStartupDelay < kIneffectiveWakeTimeout,
+ "futex_wait will time out before futex_wake is called");
+
+int futex_wait(bool priv, std::atomic<int>* uaddr, int val,
+ absl::Duration timeout = absl::InfiniteDuration()) {
+ int op = FUTEX_WAIT;
+ if (priv) {
+ op |= FUTEX_PRIVATE_FLAG;
+ }
+
+ if (timeout == absl::InfiniteDuration()) {
+ return RetryEINTR(syscall)(SYS_futex, uaddr, op, val, nullptr);
+ }
+
+ // FUTEX_WAIT doesn't adjust the timeout if it returns EINTR, so we have to do
+ // so.
+ while (true) {
+ auto const timeout_ts = absl::ToTimespec(timeout);
+ MonotonicTimer timer;
+ timer.Start();
+ int const ret = syscall(SYS_futex, uaddr, op, val, &timeout_ts);
+ if (ret != -1 || errno != EINTR) {
+ return ret;
+ }
+ timeout = std::max(timeout - timer.Duration(), absl::ZeroDuration());
+ }
+}
+
+int futex_wait_bitset(bool priv, std::atomic<int>* uaddr, int val, int bitset,
+ absl::Time deadline = absl::InfiniteFuture()) {
+ int op = FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME;
+ if (priv) {
+ op |= FUTEX_PRIVATE_FLAG;
+ }
+
+ auto const deadline_ts = absl::ToTimespec(deadline);
+ return RetryEINTR(syscall)(
+ SYS_futex, uaddr, op, val,
+ deadline == absl::InfiniteFuture() ? nullptr : &deadline_ts, nullptr,
+ bitset);
+}
+
+int futex_wake(bool priv, std::atomic<int>* uaddr, int count) {
+ int op = FUTEX_WAKE;
+ if (priv) {
+ op |= FUTEX_PRIVATE_FLAG;
+ }
+ return syscall(SYS_futex, uaddr, op, count);
+}
+
+int futex_wake_bitset(bool priv, std::atomic<int>* uaddr, int count,
+ int bitset) {
+ int op = FUTEX_WAKE_BITSET;
+ if (priv) {
+ op |= FUTEX_PRIVATE_FLAG;
+ }
+ return syscall(SYS_futex, uaddr, op, count, nullptr, nullptr, bitset);
+}
+
+int futex_wake_op(bool priv, std::atomic<int>* uaddr1, std::atomic<int>* uaddr2,
+ int nwake1, int nwake2, uint32_t sub_op) {
+ int op = FUTEX_WAKE_OP;
+ if (priv) {
+ op |= FUTEX_PRIVATE_FLAG;
+ }
+ return syscall(SYS_futex, uaddr1, op, nwake1, nwake2, uaddr2, sub_op);
+}
+
+int futex_lock_pi(bool priv, std::atomic<int>* uaddr) {
+ int op = FUTEX_LOCK_PI;
+ if (priv) {
+ op |= FUTEX_PRIVATE_FLAG;
+ }
+ int zero = 0;
+ if (uaddr->compare_exchange_strong(zero, gettid())) {
+ return 0;
+ }
+ return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr);
+}
+
+int futex_trylock_pi(bool priv, std::atomic<int>* uaddr) {
+ int op = FUTEX_TRYLOCK_PI;
+ if (priv) {
+ op |= FUTEX_PRIVATE_FLAG;
+ }
+ int zero = 0;
+ if (uaddr->compare_exchange_strong(zero, gettid())) {
+ return 0;
+ }
+ return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr);
+}
+
+int futex_unlock_pi(bool priv, std::atomic<int>* uaddr) {
+ int op = FUTEX_UNLOCK_PI;
+ if (priv) {
+ op |= FUTEX_PRIVATE_FLAG;
+ }
+ int tid = gettid();
+ if (uaddr->compare_exchange_strong(tid, 0)) {
+ return 0;
+ }
+ return RetryEINTR(syscall)(SYS_futex, uaddr, op, nullptr, nullptr);
+}
+
+// Fixture for futex tests parameterized by whether to use private or shared
+// futexes.
+class PrivateAndSharedFutexTest : public ::testing::TestWithParam<bool> {
+ protected:
+ bool IsPrivate() const { return GetParam(); }
+ int PrivateFlag() const { return IsPrivate() ? FUTEX_PRIVATE_FLAG : 0; }
+};
+
+// FUTEX_WAIT with 0 timeout does not block.
+TEST_P(PrivateAndSharedFutexTest, Wait_ZeroTimeout) {
+ struct timespec timeout = {};
+
+ // Don't use the futex_wait helper because it adjusts timeout.
+ int a = 1;
+ EXPECT_THAT(syscall(SYS_futex, &a, FUTEX_WAIT | PrivateFlag(), a, &timeout),
+ SyscallFailsWithErrno(ETIMEDOUT));
+}
+
+TEST_P(PrivateAndSharedFutexTest, Wait_Timeout) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(1);
+
+ MonotonicTimer timer;
+ timer.Start();
+ constexpr absl::Duration kTimeout = absl::Seconds(1);
+ EXPECT_THAT(futex_wait(IsPrivate(), &a, a, kTimeout),
+ SyscallFailsWithErrno(ETIMEDOUT));
+ EXPECT_GE(timer.Duration(), kTimeout);
+}
+
+TEST_P(PrivateAndSharedFutexTest, Wait_BitsetTimeout) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(1);
+
+ MonotonicTimer timer;
+ timer.Start();
+ constexpr absl::Duration kTimeout = absl::Seconds(1);
+ EXPECT_THAT(
+ futex_wait_bitset(IsPrivate(), &a, a, 0xffffffff, absl::Now() + kTimeout),
+ SyscallFailsWithErrno(ETIMEDOUT));
+ EXPECT_GE(timer.Duration(), kTimeout);
+}
+
+TEST_P(PrivateAndSharedFutexTest, WaitBitset_NegativeTimeout) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(1);
+
+ MonotonicTimer timer;
+ timer.Start();
+ EXPECT_THAT(futex_wait_bitset(IsPrivate(), &a, a, 0xffffffff,
+ absl::Now() - absl::Seconds(1)),
+ SyscallFailsWithErrno(ETIMEDOUT));
+}
+
+TEST_P(PrivateAndSharedFutexTest, Wait_WrongVal) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(1);
+ EXPECT_THAT(futex_wait(IsPrivate(), &a, a + 1),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(PrivateAndSharedFutexTest, Wait_ZeroBitset) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(1);
+ EXPECT_THAT(futex_wait_bitset(IsPrivate(), &a, a, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(PrivateAndSharedFutexTest, Wake1_NoRandomSave) {
+ constexpr int kInitialValue = 1;
+ std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+
+ // Prevent save/restore from interrupting futex_wait, which will cause it to
+ // return EAGAIN instead of the expected result if futex_wait is restarted
+ // after we change the value of a below.
+ DisableSave ds;
+ ScopedThread thread([&] {
+ EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue),
+ SyscallSucceedsWithValue(0));
+ });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ // Change a so that if futex_wake happens before futex_wait, the latter
+ // returns EAGAIN instead of hanging the test.
+ a.fetch_add(1);
+ EXPECT_THAT(futex_wake(IsPrivate(), &a, 1), SyscallSucceedsWithValue(1));
+}
+
+TEST_P(PrivateAndSharedFutexTest, Wake0_NoRandomSave) {
+ constexpr int kInitialValue = 1;
+ std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+
+ // Prevent save/restore from interrupting futex_wait, which will cause it to
+ // return EAGAIN instead of the expected result if futex_wait is restarted
+ // after we change the value of a below.
+ DisableSave ds;
+ ScopedThread thread([&] {
+ EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue),
+ SyscallSucceedsWithValue(0));
+ });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ // Change a so that if futex_wake happens before futex_wait, the latter
+ // returns EAGAIN instead of hanging the test.
+ a.fetch_add(1);
+ // The Linux kernel wakes one waiter even if val is 0 or negative.
+ EXPECT_THAT(futex_wake(IsPrivate(), &a, 0), SyscallSucceedsWithValue(1));
+}
+
+TEST_P(PrivateAndSharedFutexTest, WakeAll_NoRandomSave) {
+ constexpr int kInitialValue = 1;
+ std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+
+ DisableSave ds;
+ constexpr int kThreads = 5;
+ std::vector<std::unique_ptr<ScopedThread>> threads;
+ threads.reserve(kThreads);
+ for (int i = 0; i < kThreads; i++) {
+ threads.push_back(absl::make_unique<ScopedThread>([&] {
+ EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue),
+ SyscallSucceeds());
+ }));
+ }
+ absl::SleepFor(kWaiterStartupDelay);
+
+ a.fetch_add(1);
+ EXPECT_THAT(futex_wake(IsPrivate(), &a, kThreads),
+ SyscallSucceedsWithValue(kThreads));
+}
+
+TEST_P(PrivateAndSharedFutexTest, WakeSome_NoRandomSave) {
+ constexpr int kInitialValue = 1;
+ std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+
+ DisableSave ds;
+ constexpr int kThreads = 5;
+ constexpr int kWokenThreads = 3;
+ static_assert(kWokenThreads < kThreads,
+ "can't wake more threads than are created");
+ std::vector<std::unique_ptr<ScopedThread>> threads;
+ threads.reserve(kThreads);
+ std::vector<int> rets;
+ rets.reserve(kThreads);
+ std::vector<int> errs;
+ errs.reserve(kThreads);
+ for (int i = 0; i < kThreads; i++) {
+ rets.push_back(-1);
+ errs.push_back(0);
+ }
+ for (int i = 0; i < kThreads; i++) {
+ threads.push_back(absl::make_unique<ScopedThread>([&, i] {
+ rets[i] =
+ futex_wait(IsPrivate(), &a, kInitialValue, kIneffectiveWakeTimeout);
+ errs[i] = errno;
+ }));
+ }
+ absl::SleepFor(kWaiterStartupDelay);
+
+ a.fetch_add(1);
+ EXPECT_THAT(futex_wake(IsPrivate(), &a, kWokenThreads),
+ SyscallSucceedsWithValue(kWokenThreads));
+
+ int woken = 0;
+ int timedout = 0;
+ for (int i = 0; i < kThreads; i++) {
+ threads[i]->Join();
+ if (rets[i] == 0) {
+ woken++;
+ } else if (errs[i] == ETIMEDOUT) {
+ timedout++;
+ } else {
+ ADD_FAILURE() << " thread " << i << ": returned " << rets[i] << ", errno "
+ << errs[i];
+ }
+ }
+ EXPECT_EQ(woken, kWokenThreads);
+ EXPECT_EQ(timedout, kThreads - kWokenThreads);
+}
+
+TEST_P(PrivateAndSharedFutexTest, WaitBitset_Wake_NoRandomSave) {
+ constexpr int kInitialValue = 1;
+ std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+
+ DisableSave ds;
+ ScopedThread thread([&] {
+ EXPECT_THAT(futex_wait_bitset(IsPrivate(), &a, kInitialValue, 0b01001000),
+ SyscallSucceeds());
+ });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ a.fetch_add(1);
+ EXPECT_THAT(futex_wake(IsPrivate(), &a, 1), SyscallSucceedsWithValue(1));
+}
+
+TEST_P(PrivateAndSharedFutexTest, Wait_WakeBitset_NoRandomSave) {
+ constexpr int kInitialValue = 1;
+ std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+
+ DisableSave ds;
+ ScopedThread thread([&] {
+ EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue), SyscallSucceeds());
+ });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ a.fetch_add(1);
+ EXPECT_THAT(futex_wake_bitset(IsPrivate(), &a, 1, 0b01001000),
+ SyscallSucceedsWithValue(1));
+}
+
+TEST_P(PrivateAndSharedFutexTest, WaitBitset_WakeBitsetMatch_NoRandomSave) {
+ constexpr int kInitialValue = 1;
+ std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+
+ constexpr int kBitset = 0b01001000;
+
+ DisableSave ds;
+ ScopedThread thread([&] {
+ EXPECT_THAT(futex_wait_bitset(IsPrivate(), &a, kInitialValue, kBitset),
+ SyscallSucceeds());
+ });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ a.fetch_add(1);
+ EXPECT_THAT(futex_wake_bitset(IsPrivate(), &a, 1, kBitset),
+ SyscallSucceedsWithValue(1));
+}
+
+TEST_P(PrivateAndSharedFutexTest, WaitBitset_WakeBitsetNoMatch_NoRandomSave) {
+ constexpr int kInitialValue = 1;
+ std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+
+ constexpr int kWaitBitset = 0b01000001;
+ constexpr int kWakeBitset = 0b00101000;
+ static_assert((kWaitBitset & kWakeBitset) == 0,
+ "futex_wake_bitset will wake waiter");
+
+ DisableSave ds;
+ ScopedThread thread([&] {
+ EXPECT_THAT(futex_wait_bitset(IsPrivate(), &a, kInitialValue, kWaitBitset,
+ absl::Now() + kIneffectiveWakeTimeout),
+ SyscallFailsWithErrno(ETIMEDOUT));
+ });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ a.fetch_add(1);
+ EXPECT_THAT(futex_wake_bitset(IsPrivate(), &a, 1, kWakeBitset),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(PrivateAndSharedFutexTest, WakeOpCondSuccess_NoRandomSave) {
+ constexpr int kInitialValue = 1;
+ std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+ std::atomic<int> b = ATOMIC_VAR_INIT(kInitialValue);
+
+ DisableSave ds;
+ ScopedThread thread_a([&] {
+ EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue), SyscallSucceeds());
+ });
+ ScopedThread thread_b([&] {
+ EXPECT_THAT(futex_wait(IsPrivate(), &b, kInitialValue), SyscallSucceeds());
+ });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ a.fetch_add(1);
+ b.fetch_add(1);
+ // This futex_wake_op should:
+ // - Wake 1 waiter on a unconditionally.
+ // - Wake 1 waiter on b if b == kInitialValue + 1, which it is.
+ // - Do "b += 1".
+ EXPECT_THAT(futex_wake_op(IsPrivate(), &a, &b, 1, 1,
+ FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_EQ,
+ (kInitialValue + 1))),
+ SyscallSucceedsWithValue(2));
+ EXPECT_EQ(b, kInitialValue + 2);
+}
+
+TEST_P(PrivateAndSharedFutexTest, WakeOpCondFailure_NoRandomSave) {
+ constexpr int kInitialValue = 1;
+ std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+ std::atomic<int> b = ATOMIC_VAR_INIT(kInitialValue);
+
+ DisableSave ds;
+ ScopedThread thread_a([&] {
+ EXPECT_THAT(futex_wait(IsPrivate(), &a, kInitialValue), SyscallSucceeds());
+ });
+ ScopedThread thread_b([&] {
+ EXPECT_THAT(
+ futex_wait(IsPrivate(), &b, kInitialValue, kIneffectiveWakeTimeout),
+ SyscallFailsWithErrno(ETIMEDOUT));
+ });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ a.fetch_add(1);
+ b.fetch_add(1);
+ // This futex_wake_op should:
+ // - Wake 1 waiter on a unconditionally.
+ // - Wake 1 waiter on b if b == kInitialValue - 1, which it isn't.
+ // - Do "b += 1".
+ EXPECT_THAT(futex_wake_op(IsPrivate(), &a, &b, 1, 1,
+ FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_EQ,
+ (kInitialValue - 1))),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ(b, kInitialValue + 2);
+}
+
+TEST_P(PrivateAndSharedFutexTest, NoWakeInterprocessPrivateAnon_NoRandomSave) {
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ auto const ptr = static_cast<std::atomic<int>*>(mapping.ptr());
+ constexpr int kInitialValue = 1;
+ ptr->store(kInitialValue);
+
+ DisableSave ds;
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ TEST_PCHECK(futex_wait(IsPrivate(), ptr, kInitialValue,
+ kIneffectiveWakeTimeout) == -1 &&
+ errno == ETIMEDOUT);
+ _exit(0);
+ }
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+ absl::SleepFor(kWaiterStartupDelay);
+
+ EXPECT_THAT(futex_wake(IsPrivate(), ptr, 1), SyscallSucceedsWithValue(0));
+
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << " status " << status;
+}
+
+TEST_P(PrivateAndSharedFutexTest, WakeAfterCOWBreak_NoRandomSave) {
+ // Use a futex on a non-stack mapping so we can be sure that the child process
+ // below isn't the one that breaks copy-on-write.
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ auto const ptr = static_cast<std::atomic<int>*>(mapping.ptr());
+ constexpr int kInitialValue = 1;
+ ptr->store(kInitialValue);
+
+ DisableSave ds;
+ ScopedThread thread([&] {
+ EXPECT_THAT(futex_wait(IsPrivate(), ptr, kInitialValue), SyscallSucceeds());
+ });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // Wait to be killed by the parent.
+ while (true) pause();
+ }
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+ auto cleanup_child = Cleanup([&] {
+ EXPECT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds());
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL)
+ << " status " << status;
+ });
+
+ // In addition to preventing a late futex_wait from sleeping, this breaks
+ // copy-on-write on the mapped page.
+ ptr->fetch_add(1);
+ EXPECT_THAT(futex_wake(IsPrivate(), ptr, 1), SyscallSucceedsWithValue(1));
+}
+
+TEST_P(PrivateAndSharedFutexTest, WakeWrongKind_NoRandomSave) {
+ constexpr int kInitialValue = 1;
+ std::atomic<int> a = ATOMIC_VAR_INIT(kInitialValue);
+
+ DisableSave ds;
+ ScopedThread thread([&] {
+ EXPECT_THAT(
+ futex_wait(IsPrivate(), &a, kInitialValue, kIneffectiveWakeTimeout),
+ SyscallFailsWithErrno(ETIMEDOUT));
+ });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ a.fetch_add(1);
+ // The value of priv passed to futex_wake is the opposite of that passed to
+ // the futex_waiter; we expect this not to wake the waiter.
+ EXPECT_THAT(futex_wake(!IsPrivate(), &a, 1), SyscallSucceedsWithValue(0));
+}
+
+INSTANTIATE_TEST_SUITE_P(SharedPrivate, PrivateAndSharedFutexTest,
+ ::testing::Bool());
+
+// Passing null as the address only works for private futexes.
+
+TEST(PrivateFutexTest, WakeOp0Set) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(1);
+
+ int futex_op = FUTEX_OP(FUTEX_OP_SET, 2, 0, 0);
+ EXPECT_THAT(futex_wake_op(true, nullptr, &a, 0, 0, futex_op),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(a, 2);
+}
+
+TEST(PrivateFutexTest, WakeOp0Add) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(1);
+ int futex_op = FUTEX_OP(FUTEX_OP_ADD, 1, 0, 0);
+ EXPECT_THAT(futex_wake_op(true, nullptr, &a, 0, 0, futex_op),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(a, 2);
+}
+
+TEST(PrivateFutexTest, WakeOp0Or) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(0b01);
+ int futex_op = FUTEX_OP(FUTEX_OP_OR, 0b10, 0, 0);
+ EXPECT_THAT(futex_wake_op(true, nullptr, &a, 0, 0, futex_op),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(a, 0b11);
+}
+
+TEST(PrivateFutexTest, WakeOp0Andn) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(0b11);
+ int futex_op = FUTEX_OP(FUTEX_OP_ANDN, 0b10, 0, 0);
+ EXPECT_THAT(futex_wake_op(true, nullptr, &a, 0, 0, futex_op),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(a, 0b01);
+}
+
+TEST(PrivateFutexTest, WakeOp0Xor) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(0b1010);
+ int futex_op = FUTEX_OP(FUTEX_OP_XOR, 0b1100, 0, 0);
+ EXPECT_THAT(futex_wake_op(true, nullptr, &a, 0, 0, futex_op),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(a, 0b0110);
+}
+
+TEST(SharedFutexTest, WakeInterprocessSharedAnon_NoRandomSave) {
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED));
+ auto const ptr = static_cast<std::atomic<int>*>(mapping.ptr());
+ constexpr int kInitialValue = 1;
+ ptr->store(kInitialValue);
+
+ DisableSave ds;
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ TEST_PCHECK(futex_wait(false, ptr, kInitialValue) == 0);
+ _exit(0);
+ }
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+ auto kill_child = Cleanup(
+ [&] { EXPECT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds()); });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ ptr->fetch_add(1);
+ // This is an ASSERT so that if it fails, we immediately abort the test (and
+ // kill the subprocess).
+ ASSERT_THAT(futex_wake(false, ptr, 1), SyscallSucceedsWithValue(1));
+
+ kill_child.Release();
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << " status " << status;
+}
+
+TEST(SharedFutexTest, WakeInterprocessFile_NoRandomSave) {
+ auto const file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ ASSERT_THAT(truncate(file.path().c_str(), kPageSize), SyscallSucceeds());
+ auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd.get(), 0));
+ auto const ptr = static_cast<std::atomic<int>*>(mapping.ptr());
+ constexpr int kInitialValue = 1;
+ ptr->store(kInitialValue);
+
+ DisableSave ds;
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ TEST_PCHECK(futex_wait(false, ptr, kInitialValue) == 0);
+ _exit(0);
+ }
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+ auto kill_child = Cleanup(
+ [&] { EXPECT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds()); });
+ absl::SleepFor(kWaiterStartupDelay);
+
+ ptr->fetch_add(1);
+ // This is an ASSERT so that if it fails, we immediately abort the test (and
+ // kill the subprocess).
+ ASSERT_THAT(futex_wake(false, ptr, 1), SyscallSucceedsWithValue(1));
+
+ kill_child.Release();
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << " status " << status;
+}
+
+TEST_P(PrivateAndSharedFutexTest, PIBasic) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(0);
+
+ ASSERT_THAT(futex_lock_pi(IsPrivate(), &a), SyscallSucceeds());
+ EXPECT_EQ(a.load(), gettid());
+ EXPECT_THAT(futex_lock_pi(IsPrivate(), &a), SyscallFailsWithErrno(EDEADLK));
+
+ ASSERT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallSucceeds());
+ EXPECT_EQ(a.load(), 0);
+ EXPECT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallFailsWithErrno(EPERM));
+}
+
+TEST_P(PrivateAndSharedFutexTest, PIConcurrency_NoRandomSave) {
+ DisableSave ds; // Too many syscalls.
+
+ std::atomic<int> a = ATOMIC_VAR_INIT(0);
+ const bool is_priv = IsPrivate();
+
+ std::unique_ptr<ScopedThread> threads[100];
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(threads); ++i) {
+ threads[i] = absl::make_unique<ScopedThread>([is_priv, &a] {
+ for (size_t j = 0; j < 10; ++j) {
+ ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds());
+ EXPECT_EQ(a.load() & FUTEX_TID_MASK, gettid());
+ SleepSafe(absl::Milliseconds(5));
+ ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds());
+ }
+ });
+ }
+}
+
+TEST_P(PrivateAndSharedFutexTest, PIWaiters) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(0);
+ const bool is_priv = IsPrivate();
+
+ ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds());
+ EXPECT_EQ(a.load(), gettid());
+
+ ScopedThread th([is_priv, &a] {
+ ASSERT_THAT(futex_lock_pi(is_priv, &a), SyscallSucceeds());
+ ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds());
+ });
+
+ // Wait until the thread blocks on the futex, setting the waiters bit.
+ auto start = absl::Now();
+ while (a.load() != (FUTEX_WAITERS | gettid())) {
+ ASSERT_LT(absl::Now() - start, absl::Seconds(5));
+ absl::SleepFor(absl::Milliseconds(100));
+ }
+ ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds());
+}
+
+TEST_P(PrivateAndSharedFutexTest, PITryLock) {
+ std::atomic<int> a = ATOMIC_VAR_INIT(0);
+ const bool is_priv = IsPrivate();
+
+ ASSERT_THAT(futex_trylock_pi(IsPrivate(), &a), SyscallSucceeds());
+ EXPECT_EQ(a.load(), gettid());
+
+ EXPECT_THAT(futex_trylock_pi(is_priv, &a), SyscallFailsWithErrno(EDEADLK));
+ ScopedThread th([is_priv, &a] {
+ EXPECT_THAT(futex_trylock_pi(is_priv, &a), SyscallFailsWithErrno(EAGAIN));
+ });
+ th.Join();
+
+ ASSERT_THAT(futex_unlock_pi(IsPrivate(), &a), SyscallSucceeds());
+}
+
+TEST_P(PrivateAndSharedFutexTest, PITryLockConcurrency_NoRandomSave) {
+ DisableSave ds; // Too many syscalls.
+
+ std::atomic<int> a = ATOMIC_VAR_INIT(0);
+ const bool is_priv = IsPrivate();
+
+ std::unique_ptr<ScopedThread> threads[10];
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(threads); ++i) {
+ threads[i] = absl::make_unique<ScopedThread>([is_priv, &a] {
+ for (size_t j = 0; j < 10;) {
+ if (futex_trylock_pi(is_priv, &a) == 0) {
+ ++j;
+ EXPECT_EQ(a.load() & FUTEX_TID_MASK, gettid());
+ SleepSafe(absl::Milliseconds(5));
+ ASSERT_THAT(futex_unlock_pi(is_priv, &a), SyscallSucceeds());
+ }
+ }
+ });
+ }
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/getcpu.cc b/test/syscalls/linux/getcpu.cc
new file mode 100644
index 000000000..f4d94bd6a
--- /dev/null
+++ b/test/syscalls/linux/getcpu.cc
@@ -0,0 +1,40 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sched.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(GetcpuTest, IsValidCpuStress) {
+ const int num_cpus = NumCPUs();
+ absl::Time deadline = absl::Now() + absl::Seconds(10);
+ while (absl::Now() < deadline) {
+ int cpu;
+ ASSERT_THAT(cpu = sched_getcpu(), SyscallSucceeds());
+ ASSERT_LT(cpu, num_cpus);
+ }
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/getdents.cc b/test/syscalls/linux/getdents.cc
new file mode 100644
index 000000000..b147d6181
--- /dev/null
+++ b/test/syscalls/linux/getdents.cc
@@ -0,0 +1,539 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
+#include "test/util/eventfd_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+using ::testing::Contains;
+using ::testing::IsEmpty;
+using ::testing::IsSupersetOf;
+using ::testing::Not;
+using ::testing::NotNull;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// New Linux dirent format.
+struct linux_dirent64 {
+ uint64_t d_ino; // Inode number
+ int64_t d_off; // Offset to next linux_dirent64
+ unsigned short d_reclen; // NOLINT, Length of this linux_dirent64
+ unsigned char d_type; // NOLINT, File type
+ char d_name[0]; // Filename (null-terminated)
+};
+
+// Old Linux dirent format.
+struct linux_dirent {
+ unsigned long d_ino; // NOLINT
+ unsigned long d_off; // NOLINT
+ unsigned short d_reclen; // NOLINT
+ char d_name[0];
+};
+
+// Wraps a buffer to provide a set of dirents.
+// T is the underlying dirent type.
+template <typename T>
+class DirentBuffer {
+ public:
+ // DirentBuffer manages the buffer.
+ explicit DirentBuffer(size_t size)
+ : managed_(true), actual_size_(size), reported_size_(size) {
+ data_ = new char[actual_size_];
+ }
+
+ // The buffer is managed externally.
+ DirentBuffer(char* data, size_t actual_size, size_t reported_size)
+ : managed_(false),
+ data_(data),
+ actual_size_(actual_size),
+ reported_size_(reported_size) {}
+
+ ~DirentBuffer() {
+ if (managed_) {
+ delete[] data_;
+ }
+ }
+
+ T* Data() { return reinterpret_cast<T*>(data_); }
+
+ T* Start(size_t read) {
+ read_ = read;
+ if (read_) {
+ return Data();
+ } else {
+ return nullptr;
+ }
+ }
+
+ T* Current() { return reinterpret_cast<T*>(&data_[off_]); }
+
+ T* Next() {
+ size_t new_off = off_ + Current()->d_reclen;
+ if (new_off >= read_ || new_off >= actual_size_) {
+ return nullptr;
+ }
+
+ off_ = new_off;
+ return Current();
+ }
+
+ size_t Size() { return reported_size_; }
+
+ void Reset() {
+ off_ = 0;
+ read_ = 0;
+ memset(data_, 0, actual_size_);
+ }
+
+ private:
+ bool managed_;
+ char* data_;
+ size_t actual_size_;
+ size_t reported_size_;
+
+ size_t off_ = 0;
+
+ size_t read_ = 0;
+};
+
+// Test for getdents/getdents64.
+// T is the Linux dirent type.
+template <typename T>
+class GetdentsTest : public ::testing::Test {
+ public:
+ using LinuxDirentType = T;
+ using DirentBufferType = DirentBuffer<T>;
+
+ protected:
+ void SetUp() override {
+ dir_ = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ fd_ = ASSERT_NO_ERRNO_AND_VALUE(Open(dir_.path(), O_RDONLY | O_DIRECTORY));
+ }
+
+ // Must be overridden with explicit specialization. See below.
+ int SyscallNum();
+
+ int Getdents(LinuxDirentType* dirp, unsigned int count) {
+ return RetryEINTR(syscall)(SyscallNum(), fd_.get(), dirp, count);
+ }
+
+ // Fill directory with num files, named by number starting at 0.
+ void FillDirectory(size_t num) {
+ for (size_t i = 0; i < num; i++) {
+ auto name = JoinPath(dir_.path(), absl::StrCat(i));
+ TEST_CHECK(CreateWithContents(name, "").ok());
+ }
+ }
+
+ // Fill directory with a given list of filenames.
+ void FillDirectoryWithFiles(const std::vector<std::string>& filenames) {
+ for (const auto& filename : filenames) {
+ auto name = JoinPath(dir_.path(), filename);
+ TEST_CHECK(CreateWithContents(name, "").ok());
+ }
+ }
+
+ // Seek to the start of the directory.
+ PosixError SeekStart() {
+ constexpr off_t kStartOfFile = 0;
+ off_t offset = lseek(fd_.get(), kStartOfFile, SEEK_SET);
+ if (offset < 0) {
+ return PosixError(errno, absl::StrCat("error seeking to ", kStartOfFile));
+ }
+ if (offset != kStartOfFile) {
+ return PosixError(EINVAL, absl::StrCat("tried to seek to ", kStartOfFile,
+ " but got ", offset));
+ }
+ return NoError();
+ }
+
+ // Call getdents multiple times, reading all dirents and calling f on each.
+ // f has the type signature PosixError f(T*).
+ // If f returns a non-OK error, so does ReadDirents.
+ template <typename F>
+ PosixError ReadDirents(DirentBufferType* dirents, F const& f) {
+ int n;
+ do {
+ dirents->Reset();
+
+ n = Getdents(dirents->Data(), dirents->Size());
+ MaybeSave();
+ if (n < 0) {
+ return PosixError(errno, "getdents");
+ }
+
+ for (auto d = dirents->Start(n); d; d = dirents->Next()) {
+ RETURN_IF_ERRNO(f(d));
+ }
+ } while (n > 0);
+
+ return NoError();
+ }
+
+ // Call Getdents successively and count all entries.
+ int ReadAndCountAllEntries(DirentBufferType* dirents) {
+ int found = 0;
+
+ EXPECT_NO_ERRNO(ReadDirents(dirents, [&](LinuxDirentType* d) {
+ found++;
+ return NoError();
+ }));
+
+ return found;
+ }
+
+ private:
+ TempPath dir_;
+ FileDescriptor fd_;
+};
+
+// Multiple template parameters are not allowed, so we must use explicit
+// template specialization to set the syscall number.
+
+// SYS_getdents isn't defined on arm64.
+#ifdef __x86_64__
+template <>
+int GetdentsTest<struct linux_dirent>::SyscallNum() {
+ return SYS_getdents;
+}
+#endif
+
+template <>
+int GetdentsTest<struct linux_dirent64>::SyscallNum() {
+ return SYS_getdents64;
+}
+
+#ifdef __x86_64__
+// Test both legacy getdents and getdents64 on x86_64.
+typedef ::testing::Types<struct linux_dirent, struct linux_dirent64>
+ GetdentsTypes;
+#elif __aarch64__
+// Test only getdents64 on arm64.
+typedef ::testing::Types<struct linux_dirent64> GetdentsTypes;
+#endif
+TYPED_TEST_SUITE(GetdentsTest, GetdentsTypes);
+
+// N.B. TYPED_TESTs require explicitly using this-> to access members of
+// GetdentsTest, since we are inside of a derived class template.
+
+TYPED_TEST(GetdentsTest, VerifyEntries) {
+ typename TestFixture::DirentBufferType dirents(1024);
+
+ this->FillDirectory(2);
+
+ // Map of all the entries we expect to find.
+ std::map<std::string, bool> found;
+ found["."] = false;
+ found[".."] = false;
+ found["0"] = false;
+ found["1"] = false;
+
+ EXPECT_NO_ERRNO(this->ReadDirents(
+ &dirents, [&](typename TestFixture::LinuxDirentType* d) {
+ auto kv = found.find(d->d_name);
+ EXPECT_NE(kv, found.end()) << "Unexpected file: " << d->d_name;
+ if (kv != found.end()) {
+ EXPECT_FALSE(kv->second);
+ }
+ found[d->d_name] = true;
+ return NoError();
+ }));
+
+ for (auto& kv : found) {
+ EXPECT_TRUE(kv.second) << "File not found: " << kv.first;
+ }
+}
+
+TYPED_TEST(GetdentsTest, VerifyPadding) {
+ typename TestFixture::DirentBufferType dirents(1024);
+
+ // Create files with names of length 1 through 16.
+ std::vector<std::string> files;
+ std::string filename;
+ for (int i = 0; i < 16; ++i) {
+ absl::StrAppend(&filename, "a");
+ files.push_back(filename);
+ }
+ this->FillDirectoryWithFiles(files);
+
+ // We expect to find all the files, plus '.' and '..'.
+ const int expect_found = 2 + files.size();
+ int found = 0;
+
+ EXPECT_NO_ERRNO(this->ReadDirents(
+ &dirents, [&](typename TestFixture::LinuxDirentType* d) {
+ EXPECT_EQ(d->d_reclen % 8, 0)
+ << "Dirent " << d->d_name
+ << " had reclen that was not byte aligned: " << d->d_name;
+ found++;
+ return NoError();
+ }));
+
+ // Make sure we found all the files.
+ EXPECT_EQ(found, expect_found);
+}
+
+// For a small directory, the provided buffer should be large enough
+// for all entries.
+TYPED_TEST(GetdentsTest, SmallDir) {
+ // . and .. should be in an otherwise empty directory.
+ int expect = 2;
+
+ // Add some actual files.
+ this->FillDirectory(2);
+ expect += 2;
+
+ typename TestFixture::DirentBufferType dirents(256);
+
+ EXPECT_EQ(expect, this->ReadAndCountAllEntries(&dirents));
+}
+
+// A directory with lots of files requires calling getdents multiple times.
+TYPED_TEST(GetdentsTest, LargeDir) {
+ // . and .. should be in an otherwise empty directory.
+ int expect = 2;
+
+ // Add some actual files.
+ this->FillDirectory(100);
+ expect += 100;
+
+ typename TestFixture::DirentBufferType dirents(256);
+
+ EXPECT_EQ(expect, this->ReadAndCountAllEntries(&dirents));
+}
+
+// If we lie about the size of the buffer, we should still be able to read the
+// entries with the available space.
+TYPED_TEST(GetdentsTest, PartialBuffer) {
+ // . and .. should be in an otherwise empty directory.
+ int expect = 2;
+
+ // Add some actual files.
+ this->FillDirectory(100);
+ expect += 100;
+
+ void* addr = mmap(0, 2 * kPageSize, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ ASSERT_NE(addr, MAP_FAILED);
+
+ char* buf = reinterpret_cast<char*>(addr);
+
+ // Guard page
+ EXPECT_THAT(
+ mprotect(reinterpret_cast<void*>(buf + kPageSize), kPageSize, PROT_NONE),
+ SyscallSucceeds());
+
+ // Limit space in buf to 256 bytes.
+ buf += kPageSize - 256;
+
+ // Lie about the buffer. Even though we claim the buffer is 1 page,
+ // we should still get all of the dirents in the first 256 bytes.
+ typename TestFixture::DirentBufferType dirents(buf, 256, kPageSize);
+
+ EXPECT_EQ(expect, this->ReadAndCountAllEntries(&dirents));
+
+ EXPECT_THAT(munmap(addr, 2 * kPageSize), SyscallSucceeds());
+}
+
+// Open many file descriptors, then scan through /proc/self/fd to find and close
+// them all. (The latter is commonly used to handle races between fork/execve
+// and the creation of unwanted non-O_CLOEXEC file descriptors.) This tests that
+// getdents iterates correctly despite mutation of /proc/self/fd.
+TYPED_TEST(GetdentsTest, ProcSelfFd) {
+ constexpr size_t kNfds = 10;
+ std::unordered_map<int, FileDescriptor> fds;
+ fds.reserve(kNfds);
+ for (size_t i = 0; i < kNfds; i++) {
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD());
+ fds.emplace(fd.get(), std::move(fd));
+ }
+
+ const FileDescriptor proc_self_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/fd", O_RDONLY | O_DIRECTORY));
+
+ // Make the buffer very small since we want to iterate.
+ typename TestFixture::DirentBufferType dirents(
+ 2 * sizeof(typename TestFixture::LinuxDirentType));
+ std::unordered_set<int> prev_fds;
+ while (true) {
+ dirents.Reset();
+ int rv;
+ ASSERT_THAT(rv = RetryEINTR(syscall)(this->SyscallNum(), proc_self_fd.get(),
+ dirents.Data(), dirents.Size()),
+ SyscallSucceeds());
+ if (rv == 0) {
+ break;
+ }
+ for (auto* d = dirents.Start(rv); d; d = dirents.Next()) {
+ int dfd;
+ if (!absl::SimpleAtoi(d->d_name, &dfd)) continue;
+ EXPECT_TRUE(prev_fds.insert(dfd).second)
+ << "Repeated observation of /proc/self/fd/" << dfd;
+ fds.erase(dfd);
+ }
+ }
+
+ // Check that we closed every fd.
+ EXPECT_THAT(fds, ::testing::IsEmpty());
+}
+
+// Test that getdents returns ENOTDIR when called on a file.
+TYPED_TEST(GetdentsTest, NotDir) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+ typename TestFixture::DirentBufferType dirents(256);
+ EXPECT_THAT(RetryEINTR(syscall)(this->SyscallNum(), fd.get(), dirents.Data(),
+ dirents.Size()),
+ SyscallFailsWithErrno(ENOTDIR));
+}
+
+// Test that SEEK_SET to 0 causes getdents to re-read the entries.
+TYPED_TEST(GetdentsTest, SeekResetsCursor) {
+ // . and .. should be in an otherwise empty directory.
+ int expect = 2;
+
+ // Add some files to the directory.
+ this->FillDirectory(10);
+ expect += 10;
+
+ typename TestFixture::DirentBufferType dirents(256);
+
+ // We should get all the expected entries.
+ EXPECT_EQ(expect, this->ReadAndCountAllEntries(&dirents));
+
+ // Seek back to 0.
+ ASSERT_NO_ERRNO(this->SeekStart());
+
+ // We should get all the expected entries again.
+ EXPECT_EQ(expect, this->ReadAndCountAllEntries(&dirents));
+}
+
+// Test that getdents() after SEEK_END succeeds.
+// This is a regression test for #128.
+TYPED_TEST(GetdentsTest, Issue128ProcSeekEnd) {
+ auto fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self", O_RDONLY | O_DIRECTORY));
+ typename TestFixture::DirentBufferType dirents(256);
+
+ ASSERT_THAT(lseek(fd.get(), 0, SEEK_END), SyscallSucceeds());
+ ASSERT_THAT(RetryEINTR(syscall)(this->SyscallNum(), fd.get(), dirents.Data(),
+ dirents.Size()),
+ SyscallSucceeds());
+}
+
+// Some tests using the glibc readdir interface.
+TEST(ReaddirTest, OpenDir) {
+ DIR* dev;
+ ASSERT_THAT(dev = opendir("/dev"), NotNull());
+ EXPECT_THAT(closedir(dev), SyscallSucceeds());
+}
+
+TEST(ReaddirTest, RootContainsBasicDirectories) {
+ EXPECT_THAT(ListDir("/", true),
+ IsPosixErrorOkAndHolds(IsSupersetOf(
+ {"bin", "dev", "etc", "lib", "proc", "sbin", "usr"})));
+}
+
+TEST(ReaddirTest, Bug24096713Dev) {
+ auto contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/dev", true));
+ EXPECT_THAT(contents, Not(IsEmpty()));
+}
+
+TEST(ReaddirTest, Bug24096713ProcTid) {
+ auto contents = ASSERT_NO_ERRNO_AND_VALUE(
+ ListDir(absl::StrCat("/proc/", syscall(SYS_gettid), "/"), true));
+ EXPECT_THAT(contents, Not(IsEmpty()));
+}
+
+TEST(ReaddirTest, Bug33429925Proc) {
+ auto contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/proc", true));
+ EXPECT_THAT(contents, Not(IsEmpty()));
+}
+
+TEST(ReaddirTest, Bug35110122Root) {
+ auto contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/", true));
+ EXPECT_THAT(contents, Not(IsEmpty()));
+}
+
+// Unlink should invalidate getdents cache.
+TEST(ReaddirTest, GoneAfterRemoveCache) {
+ TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+ std::string name = std::string(Basename(file.path()));
+
+ auto contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir(dir.path(), true));
+ EXPECT_THAT(contents, Contains(name));
+
+ file.reset();
+
+ contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir(dir.path(), true));
+ EXPECT_THAT(contents, Not(Contains(name)));
+}
+
+// Regression test for b/137398511. Rename should invalidate getdents cache.
+TEST(ReaddirTest, GoneAfterRenameCache) {
+ TempPath src = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath dst = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(src.path()));
+ std::string name = std::string(Basename(file.path()));
+
+ auto contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir(src.path(), true));
+ EXPECT_THAT(contents, Contains(name));
+
+ ASSERT_THAT(rename(file.path().c_str(), JoinPath(dst.path(), name).c_str()),
+ SyscallSucceeds());
+ // Release file since it was renamed. dst cleanup will ultimately delete it.
+ file.release();
+
+ contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir(src.path(), true));
+ EXPECT_THAT(contents, Not(Contains(name)));
+
+ contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir(dst.path(), true));
+ EXPECT_THAT(contents, Contains(name));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/getrandom.cc b/test/syscalls/linux/getrandom.cc
new file mode 100644
index 000000000..f87cdd7a1
--- /dev/null
+++ b/test/syscalls/linux/getrandom.cc
@@ -0,0 +1,63 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+#ifndef SYS_getrandom
+#if defined(__x86_64__)
+#define SYS_getrandom 318
+#elif defined(__i386__)
+#define SYS_getrandom 355
+#elif defined(__aarch64__)
+#define SYS_getrandom 278
+#else
+#error "Unknown architecture"
+#endif
+#endif // SYS_getrandom
+
+bool SomeByteIsNonZero(char* random_bytes, int length) {
+ for (int i = 0; i < length; i++) {
+ if (random_bytes[i] != 0) {
+ return true;
+ }
+ }
+ return false;
+}
+
+TEST(GetrandomTest, IsRandom) {
+ // This test calls get_random and makes sure that the array is filled in with
+ // something that is non-zero. Perhaps we get back \x00\x00\x00\x00\x00.... as
+ // a random result, but it's so unlikely that we'll just ignore this.
+ char random_bytes[64] = {};
+ int n = syscall(SYS_getrandom, random_bytes, 64, 0);
+ SKIP_IF(!IsRunningOnGvisor() && n < 0 && errno == ENOSYS);
+ EXPECT_THAT(n, SyscallSucceeds());
+ EXPECT_GT(n, 0); // Some bytes should be returned.
+ EXPECT_TRUE(SomeByteIsNonZero(random_bytes, n));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/getrusage.cc b/test/syscalls/linux/getrusage.cc
new file mode 100644
index 000000000..0e51d42a8
--- /dev/null
+++ b/test/syscalls/linux/getrusage.cc
@@ -0,0 +1,177 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/logging.h"
+#include "test/util/memory_util.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(GetrusageTest, BasicFork) {
+ pid_t pid = fork();
+ if (pid == 0) {
+ struct rusage rusage_self;
+ TEST_PCHECK(getrusage(RUSAGE_SELF, &rusage_self) == 0);
+ struct rusage rusage_children;
+ TEST_PCHECK(getrusage(RUSAGE_CHILDREN, &rusage_children) == 0);
+ // The child has consumed some memory.
+ TEST_CHECK(rusage_self.ru_maxrss != 0);
+ // The child has no children of its own.
+ TEST_CHECK(rusage_children.ru_maxrss == 0);
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(pid, &status, 0), SyscallSucceeds());
+ struct rusage rusage_self;
+ ASSERT_THAT(getrusage(RUSAGE_SELF, &rusage_self), SyscallSucceeds());
+ struct rusage rusage_children;
+ ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &rusage_children), SyscallSucceeds());
+ // The parent has consumed some memory.
+ EXPECT_GT(rusage_self.ru_maxrss, 0);
+ // The child has consumed some memory, and because it has exited we can get
+ // its max RSS.
+ EXPECT_GT(rusage_children.ru_maxrss, 0);
+}
+
+// Verifies that a process can get the max resident set size of its grandchild,
+// i.e. that maxrss propagates correctly from children to waiting parents.
+TEST(GetrusageTest, Grandchild) {
+ constexpr int kGrandchildSizeKb = 1024;
+ pid_t pid = fork();
+ if (pid == 0) {
+ pid = fork();
+ if (pid == 0) {
+ int flags = MAP_ANONYMOUS | MAP_POPULATE | MAP_PRIVATE;
+ void* addr =
+ mmap(nullptr, kGrandchildSizeKb * 1024, PROT_WRITE, flags, -1, 0);
+ TEST_PCHECK(addr != MAP_FAILED);
+ } else {
+ int status;
+ TEST_PCHECK(RetryEINTR(waitpid)(pid, &status, 0) == pid);
+ }
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(pid, &status, 0), SyscallSucceeds());
+ struct rusage rusage_self;
+ ASSERT_THAT(getrusage(RUSAGE_SELF, &rusage_self), SyscallSucceeds());
+ struct rusage rusage_children;
+ ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &rusage_children), SyscallSucceeds());
+ // The parent has consumed some memory.
+ EXPECT_GT(rusage_self.ru_maxrss, 0);
+ // The child should consume next to no memory, but the grandchild will
+ // consume at least 1MB. Verify that usage bubbles up to the grandparent.
+ EXPECT_GT(rusage_children.ru_maxrss, kGrandchildSizeKb);
+}
+
+// Verifies that processes ignoring SIGCHLD do not have updated child maxrss
+// updated.
+TEST(GetrusageTest, IgnoreSIGCHLD) {
+ struct sigaction sa;
+ sa.sa_handler = SIG_IGN;
+ sa.sa_flags = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGCHLD, sa));
+ pid_t pid = fork();
+ if (pid == 0) {
+ struct rusage rusage_self;
+ TEST_PCHECK(getrusage(RUSAGE_SELF, &rusage_self) == 0);
+ // The child has consumed some memory.
+ TEST_CHECK(rusage_self.ru_maxrss != 0);
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(pid, &status, 0),
+ SyscallFailsWithErrno(ECHILD));
+ struct rusage rusage_self;
+ ASSERT_THAT(getrusage(RUSAGE_SELF, &rusage_self), SyscallSucceeds());
+ struct rusage rusage_children;
+ ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &rusage_children), SyscallSucceeds());
+ // The parent has consumed some memory.
+ EXPECT_GT(rusage_self.ru_maxrss, 0);
+ // The child's maxrss should not have propagated up.
+ EXPECT_EQ(rusage_children.ru_maxrss, 0);
+}
+
+// Verifies that zombie processes do not update their parent's maxrss. Only
+// reaped processes should do this.
+TEST(GetrusageTest, IgnoreZombie) {
+ pid_t pid = fork();
+ if (pid == 0) {
+ struct rusage rusage_self;
+ TEST_PCHECK(getrusage(RUSAGE_SELF, &rusage_self) == 0);
+ struct rusage rusage_children;
+ TEST_PCHECK(getrusage(RUSAGE_CHILDREN, &rusage_children) == 0);
+ // The child has consumed some memory.
+ TEST_CHECK(rusage_self.ru_maxrss != 0);
+ // The child has no children of its own.
+ TEST_CHECK(rusage_children.ru_maxrss == 0);
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ // Give the child time to exit. Because we don't call wait, the child should
+ // remain a zombie.
+ absl::SleepFor(absl::Seconds(5));
+ struct rusage rusage_self;
+ ASSERT_THAT(getrusage(RUSAGE_SELF, &rusage_self), SyscallSucceeds());
+ struct rusage rusage_children;
+ ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &rusage_children), SyscallSucceeds());
+ // The parent has consumed some memory.
+ EXPECT_GT(rusage_self.ru_maxrss, 0);
+ // The child has consumed some memory, but hasn't been reaped.
+ EXPECT_EQ(rusage_children.ru_maxrss, 0);
+}
+
+TEST(GetrusageTest, Wait4) {
+ pid_t pid = fork();
+ if (pid == 0) {
+ struct rusage rusage_self;
+ TEST_PCHECK(getrusage(RUSAGE_SELF, &rusage_self) == 0);
+ struct rusage rusage_children;
+ TEST_PCHECK(getrusage(RUSAGE_CHILDREN, &rusage_children) == 0);
+ // The child has consumed some memory.
+ TEST_CHECK(rusage_self.ru_maxrss != 0);
+ // The child has no children of its own.
+ TEST_CHECK(rusage_children.ru_maxrss == 0);
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ struct rusage rusage_children;
+ int status;
+ ASSERT_THAT(RetryEINTR(wait4)(pid, &status, 0, &rusage_children),
+ SyscallSucceeds());
+ // The child has consumed some memory, and because it has exited we can get
+ // its max RSS.
+ EXPECT_GT(rusage_children.ru_maxrss, 0);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc
new file mode 100644
index 000000000..220874aeb
--- /dev/null
+++ b/test/syscalls/linux/inotify.cc
@@ -0,0 +1,2380 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <libgen.h>
+#include <sched.h>
+#include <sys/epoll.h>
+#include <sys/inotify.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/xattr.h>
+
+#include <atomic>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/epoll_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+using ::absl::StreamFormat;
+using ::absl::StrFormat;
+
+constexpr int kBufSize = 1024;
+
+// C++-friendly version of struct inotify_event.
+struct Event {
+ int32_t wd;
+ uint32_t mask;
+ uint32_t cookie;
+ uint32_t len;
+ std::string name;
+
+ Event(uint32_t mask, int32_t wd, absl::string_view name, uint32_t cookie)
+ : wd(wd),
+ mask(mask),
+ cookie(cookie),
+ len(name.size()),
+ name(std::string(name)) {}
+ Event(uint32_t mask, int32_t wd, absl::string_view name)
+ : Event(mask, wd, name, 0) {}
+ Event(uint32_t mask, int32_t wd) : Event(mask, wd, "", 0) {}
+ Event() : Event(0, 0, "", 0) {}
+};
+
+// Prints the symbolic name for a struct inotify_event's 'mask' field.
+std::string FlagString(uint32_t flags) {
+ std::vector<std::string> names;
+
+#define EMIT(target) \
+ if (flags & target) { \
+ names.push_back(#target); \
+ flags &= ~target; \
+ }
+
+ EMIT(IN_ACCESS);
+ EMIT(IN_ATTRIB);
+ EMIT(IN_CLOSE_WRITE);
+ EMIT(IN_CLOSE_NOWRITE);
+ EMIT(IN_CREATE);
+ EMIT(IN_DELETE);
+ EMIT(IN_DELETE_SELF);
+ EMIT(IN_MODIFY);
+ EMIT(IN_MOVE_SELF);
+ EMIT(IN_MOVED_FROM);
+ EMIT(IN_MOVED_TO);
+ EMIT(IN_OPEN);
+
+ EMIT(IN_DONT_FOLLOW);
+ EMIT(IN_EXCL_UNLINK);
+ EMIT(IN_ONESHOT);
+ EMIT(IN_ONLYDIR);
+
+ EMIT(IN_IGNORED);
+ EMIT(IN_ISDIR);
+ EMIT(IN_Q_OVERFLOW);
+ EMIT(IN_UNMOUNT);
+
+#undef EMIT
+
+ // If we have anything left over at the end, print it as a hex value.
+ if (flags) {
+ names.push_back(absl::StrCat("0x", absl::Hex(flags)));
+ }
+
+ return absl::StrJoin(names, "|");
+}
+
+std::string DumpEvent(const Event& event) {
+ return StrFormat(
+ "%s, wd=%d%s%s", FlagString(event.mask), event.wd,
+ (event.len > 0) ? StrFormat(", name=%s", event.name) : "",
+ (event.cookie > 0) ? StrFormat(", cookie=%ud", event.cookie) : "");
+}
+
+std::string DumpEvents(const std::vector<Event>& events, int indent_level) {
+ std::stringstream ss;
+ ss << StreamFormat("%d event%s:\n", events.size(),
+ (events.size() > 1) ? "s" : "");
+ int i = 0;
+ for (const Event& ev : events) {
+ ss << StreamFormat("%sevents[%d]: %s\n", std::string(indent_level, '\t'),
+ i++, DumpEvent(ev));
+ }
+ return ss.str();
+}
+
+// A matcher which takes an expected list of events to match against another
+// list of inotify events, in order. This is similar to the ElementsAre matcher,
+// but displays more informative messages on mismatch.
+class EventsAreMatcher
+ : public ::testing::MatcherInterface<std::vector<Event>> {
+ public:
+ explicit EventsAreMatcher(std::vector<Event> references)
+ : references_(std::move(references)) {}
+
+ bool MatchAndExplain(
+ std::vector<Event> events,
+ ::testing::MatchResultListener* const listener) const override {
+ if (references_.size() != events.size()) {
+ *listener << StreamFormat("\n\tCount mismatch, got %s",
+ DumpEvents(events, 2));
+ return false;
+ }
+
+ bool success = true;
+ for (unsigned int i = 0; i < references_.size(); ++i) {
+ const Event& reference = references_[i];
+ const Event& target = events[i];
+
+ if (target.mask != reference.mask || target.wd != reference.wd ||
+ target.name != reference.name || target.cookie != reference.cookie) {
+ *listener << StreamFormat("\n\tMismatch at index %d, want %s, got %s,",
+ i, DumpEvent(reference), DumpEvent(target));
+ success = false;
+ }
+ }
+
+ if (!success) {
+ *listener << StreamFormat("\n\tIn total of %s", DumpEvents(events, 2));
+ }
+ return success;
+ }
+
+ void DescribeTo(::std::ostream* const os) const override {
+ *os << StreamFormat("%s", DumpEvents(references_, 1));
+ }
+
+ void DescribeNegationTo(::std::ostream* const os) const override {
+ *os << StreamFormat("mismatch from %s", DumpEvents(references_, 1));
+ }
+
+ private:
+ std::vector<Event> references_;
+};
+
+::testing::Matcher<std::vector<Event>> Are(std::vector<Event> events) {
+ return MakeMatcher(new EventsAreMatcher(std::move(events)));
+}
+
+// Similar to the EventsAre matcher, but the order of events are ignored.
+class UnorderedEventsAreMatcher
+ : public ::testing::MatcherInterface<std::vector<Event>> {
+ public:
+ explicit UnorderedEventsAreMatcher(std::vector<Event> references)
+ : references_(std::move(references)) {}
+
+ bool MatchAndExplain(
+ std::vector<Event> events,
+ ::testing::MatchResultListener* const listener) const override {
+ if (references_.size() != events.size()) {
+ *listener << StreamFormat("\n\tCount mismatch, got %s",
+ DumpEvents(events, 2));
+ return false;
+ }
+
+ std::vector<Event> unmatched(references_);
+
+ for (const Event& candidate : events) {
+ for (auto it = unmatched.begin(); it != unmatched.end();) {
+ const Event& reference = *it;
+ if (candidate.mask == reference.mask && candidate.wd == reference.wd &&
+ candidate.name == reference.name &&
+ candidate.cookie == reference.cookie) {
+ it = unmatched.erase(it);
+ break;
+ } else {
+ ++it;
+ }
+ }
+ }
+
+ // Anything left unmatched? If so, the matcher fails.
+ if (!unmatched.empty()) {
+ *listener << StreamFormat("\n\tFailed to match %s",
+ DumpEvents(unmatched, 2));
+ *listener << StreamFormat("\n\tIn total of %s", DumpEvents(events, 2));
+ return false;
+ }
+
+ return true;
+ }
+
+ void DescribeTo(::std::ostream* const os) const override {
+ *os << StreamFormat("unordered %s", DumpEvents(references_, 1));
+ }
+
+ void DescribeNegationTo(::std::ostream* const os) const override {
+ *os << StreamFormat("mismatch from unordered %s",
+ DumpEvents(references_, 1));
+ }
+
+ private:
+ std::vector<Event> references_;
+};
+
+::testing::Matcher<std::vector<Event>> AreUnordered(std::vector<Event> events) {
+ return MakeMatcher(new UnorderedEventsAreMatcher(std::move(events)));
+}
+
+// Reads events from an inotify fd until either EOF, or read returns EAGAIN.
+PosixErrorOr<std::vector<Event>> DrainEvents(int fd) {
+ std::vector<Event> events;
+ while (true) {
+ int events_size = 0;
+ if (ioctl(fd, FIONREAD, &events_size) < 0) {
+ return PosixError(errno, "ioctl(FIONREAD) failed on inotify fd");
+ }
+ // Deliberately use a buffer that is larger than necessary, expecting to
+ // only read events_size bytes.
+ std::vector<char> buf(events_size + kBufSize, 0);
+ const ssize_t readlen = read(fd, buf.data(), buf.size());
+ MaybeSave();
+ // Read error?
+ if (readlen < 0) {
+ if (errno == EAGAIN) {
+ // If EAGAIN, no more events at the moment. Return what we have so far.
+ return events;
+ }
+ // Some other read error. Return an error. Right now if we encounter this
+ // after already reading some events, they get lost. However, we don't
+ // expect to see any error, and the calling test will fail immediately if
+ // we signal an error anyways, so this is acceptable.
+ return PosixError(errno, "read() failed on inotify fd");
+ }
+ if (readlen < static_cast<int>(sizeof(struct inotify_event))) {
+ // Impossibly short read.
+ return PosixError(
+ EIO,
+ "read() didn't return enough data represent even a single event");
+ }
+ if (readlen != events_size) {
+ return PosixError(EINVAL, absl::StrCat("read ", readlen,
+ " bytes, expected ", events_size));
+ }
+ if (readlen == 0) {
+ // EOF.
+ return events;
+ }
+
+ // Normal read.
+ const char* cursor = buf.data();
+ while (cursor < (buf.data() + readlen)) {
+ struct inotify_event event = {};
+ memcpy(&event, cursor, sizeof(struct inotify_event));
+
+ Event ev;
+ ev.wd = event.wd;
+ ev.mask = event.mask;
+ ev.cookie = event.cookie;
+ ev.len = event.len;
+ if (event.len > 0) {
+ TEST_CHECK(static_cast<int>(sizeof(struct inotify_event) + event.len) <=
+ readlen);
+ ev.name = std::string(cursor +
+ offsetof(struct inotify_event, name)); // NOLINT
+ // Name field should always be smaller than event.len, otherwise we have
+ // a buffer overflow. The two sizes aren't equal because the string
+ // constructor will stop at the first null byte, while event.name may be
+ // padded up to event.len using multiple null bytes.
+ TEST_CHECK(ev.name.size() <= event.len);
+ }
+
+ events.push_back(ev);
+ cursor += sizeof(struct inotify_event) + event.len;
+ }
+ }
+}
+
+PosixErrorOr<FileDescriptor> InotifyInit1(int flags) {
+ int fd;
+ EXPECT_THAT(fd = inotify_init1(flags), SyscallSucceeds());
+ if (fd < 0) {
+ return PosixError(errno, "inotify_init1() failed");
+ }
+ return FileDescriptor(fd);
+}
+
+PosixErrorOr<int> InotifyAddWatch(int fd, const std::string& path,
+ uint32_t mask) {
+ int wd;
+ EXPECT_THAT(wd = inotify_add_watch(fd, path.c_str(), mask),
+ SyscallSucceeds());
+ if (wd < 0) {
+ return PosixError(errno, "inotify_add_watch() failed");
+ }
+ return wd;
+}
+
+TEST(Inotify, IllegalSeek) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(0));
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE));
+}
+
+TEST(Inotify, IllegalPread) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(0));
+ int val;
+ EXPECT_THAT(pread(fd.get(), &val, sizeof(val), 0),
+ SyscallFailsWithErrno(ESPIPE));
+}
+
+TEST(Inotify, IllegalPwrite) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(0));
+ EXPECT_THAT(pwrite(fd.get(), "x", 1, 0), SyscallFailsWithErrno(ESPIPE));
+}
+
+TEST(Inotify, IllegalWrite) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(0));
+ int val = 0;
+ EXPECT_THAT(write(fd.get(), &val, sizeof(val)), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(Inotify, InitFlags) {
+ EXPECT_THAT(inotify_init1(IN_NONBLOCK | IN_CLOEXEC), SyscallSucceeds());
+ EXPECT_THAT(inotify_init1(12345), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(Inotify, NonBlockingReadReturnsEagain) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ std::vector<char> buf(kBufSize, 0);
+
+ // The read below should return fail with EAGAIN because there is no data to
+ // read and we've specified IN_NONBLOCK. We're guaranteed that there is no
+ // data to read because we haven't registered any watches yet.
+ EXPECT_THAT(read(fd.get(), buf.data(), buf.size()),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST(Inotify, AddWatchOnInvalidFdFails) {
+ // Garbage fd.
+ EXPECT_THAT(inotify_add_watch(-1, "/tmp", IN_ALL_EVENTS),
+ SyscallFailsWithErrno(EBADF));
+ EXPECT_THAT(inotify_add_watch(1337, "/tmp", IN_ALL_EVENTS),
+ SyscallFailsWithErrno(EBADF));
+
+ // Non-inotify fds.
+ EXPECT_THAT(inotify_add_watch(0, "/tmp", IN_ALL_EVENTS),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(inotify_add_watch(1, "/tmp", IN_ALL_EVENTS),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(inotify_add_watch(2, "/tmp", IN_ALL_EVENTS),
+ SyscallFailsWithErrno(EINVAL));
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open("/tmp", O_RDONLY));
+ EXPECT_THAT(inotify_add_watch(fd.get(), "/tmp", IN_ALL_EVENTS),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(Inotify, RemovingWatchGeneratesEvent) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ EXPECT_THAT(inotify_rm_watch(fd.get(), wd), SyscallSucceeds());
+
+ // Read events, ensure the first event is IN_IGNORED.
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_IGNORED, wd)}));
+}
+
+TEST(Inotify, CanDeleteFileAfterRemovingWatch) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ EXPECT_THAT(inotify_rm_watch(fd.get(), wd), SyscallSucceeds());
+ file1.reset();
+}
+
+TEST(Inotify, RemoveWatchAfterDeletingFileFails) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ file1.reset();
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd), Event(IN_DELETE_SELF, wd),
+ Event(IN_IGNORED, wd)}));
+
+ EXPECT_THAT(inotify_rm_watch(fd.get(), wd), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(Inotify, DuplicateWatchRemovalFails) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ EXPECT_THAT(inotify_rm_watch(fd.get(), wd), SyscallSucceeds());
+ EXPECT_THAT(inotify_rm_watch(fd.get(), wd), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(Inotify, ConcurrentFileDeletionAndWatchRemoval) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const std::string filename = NewTempAbsPathInDir(root.path());
+
+ auto file_create_delete = [filename]() {
+ const DisableSave ds; // Too expensive.
+ for (int i = 0; i < 100; ++i) {
+ FileDescriptor file_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_CREAT, S_IRUSR | S_IWUSR));
+ file_fd.reset(); // Close before unlinking (although save is disabled).
+ EXPECT_THAT(unlink(filename.c_str()), SyscallSucceeds());
+ }
+ };
+
+ const int shared_fd = fd.get(); // We need to pass it to the thread.
+ auto add_remove_watch = [shared_fd, filename]() {
+ for (int i = 0; i < 100; ++i) {
+ int wd = inotify_add_watch(shared_fd, filename.c_str(), IN_ALL_EVENTS);
+ MaybeSave();
+ if (wd != -1) {
+ // Watch added successfully, try removal.
+ if (inotify_rm_watch(shared_fd, wd)) {
+ // If removal fails, the only acceptable reason is if the wd
+ // is invalid, which will be the case if we try to remove
+ // the watch after the file has been deleted.
+ EXPECT_EQ(errno, EINVAL);
+ }
+ } else {
+ // Add watch failed, this should only fail if the target file doesn't
+ // exist.
+ EXPECT_EQ(errno, ENOENT);
+ }
+ }
+ };
+
+ ScopedThread t1(file_create_delete);
+ ScopedThread t2(add_remove_watch);
+}
+
+TEST(Inotify, DeletingChildGeneratesEvents) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ const std::string file1_path = file1.reset();
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(
+ events,
+ AreUnordered({Event(IN_ATTRIB, file1_wd), Event(IN_DELETE_SELF, file1_wd),
+ Event(IN_IGNORED, file1_wd),
+ Event(IN_DELETE, root_wd, Basename(file1_path))}));
+}
+
+// Creating a file in "parent/child" should generate events for child, but not
+// parent.
+TEST(Inotify, CreatingFileGeneratesEvents) {
+ const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath child =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path()));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), parent.path(), IN_ALL_EVENTS));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), child.path(), IN_ALL_EVENTS));
+
+ // Create a new file in the directory.
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(child.path()));
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+
+ // The library function we use to create the new file opens it for writing to
+ // create it and sets permissions on it, so we expect the three extra events.
+ ASSERT_THAT(events, Are({Event(IN_CREATE, wd, Basename(file1.path())),
+ Event(IN_OPEN, wd, Basename(file1.path())),
+ Event(IN_CLOSE_WRITE, wd, Basename(file1.path())),
+ Event(IN_ATTRIB, wd, Basename(file1.path()))}));
+}
+
+TEST(Inotify, ReadingFileGeneratesAccessEvent) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ root.path(), "some content", TempPath::kDefaultFileMode));
+
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ char buf;
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_ACCESS, wd, Basename(file1.path()))}));
+}
+
+TEST(Inotify, WritingFileGeneratesModifyEvent) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ const std::string data = "some content";
+ EXPECT_THAT(write(file1_fd.get(), data.c_str(), data.length()),
+ SyscallSucceeds());
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_MODIFY, wd, Basename(file1.path()))}));
+}
+
+TEST(Inotify, SizeZeroReadWriteGeneratesNothing) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ // Read from the empty file.
+ int val;
+ ASSERT_THAT(read(file1_fd.get(), &val, sizeof(val)),
+ SyscallSucceedsWithValue(0));
+
+ // Write zero bytes.
+ ASSERT_THAT(write(file1_fd.get(), "", 0), SyscallSucceedsWithValue(0));
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({}));
+}
+
+TEST(Inotify, FailedFileCreationGeneratesNoEvents) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const std::string dir_path = dir.path();
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(fd.get(), dir_path, IN_ALL_EVENTS));
+
+ const char* p = dir_path.c_str();
+ ASSERT_THAT(mkdir(p, 0777), SyscallFails());
+ ASSERT_THAT(mknod(p, S_IFIFO, 0777), SyscallFails());
+ ASSERT_THAT(symlink(p, p), SyscallFails());
+ ASSERT_THAT(link(p, p), SyscallFails());
+ std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({}));
+}
+
+TEST(Inotify, WatchSetAfterOpenReportsCloseFdEvent) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ FileDescriptor file1_fd_writable =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY));
+ FileDescriptor file1_fd_not_writable =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ file1_fd_writable.reset(); // Close file1_fd_writable.
+ std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_CLOSE_WRITE, wd, Basename(file1.path()))}));
+
+ file1_fd_not_writable.reset(); // Close file1_fd_not_writable.
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events,
+ Are({Event(IN_CLOSE_NOWRITE, wd, Basename(file1.path()))}));
+}
+
+TEST(Inotify, ChildrenDeletionInWatchedDirGeneratesEvent) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+ TempPath dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path()));
+
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ const std::string file1_path = file1.reset();
+ const std::string dir1_path = dir1.release();
+ EXPECT_THAT(rmdir(dir1_path.c_str()), SyscallSucceeds());
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+
+ ASSERT_THAT(events,
+ Are({Event(IN_DELETE, wd, Basename(file1_path)),
+ Event(IN_DELETE | IN_ISDIR, wd, Basename(dir1_path))}));
+}
+
+TEST(Inotify, RmdirOnWatchedTargetGeneratesEvent) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ EXPECT_THAT(rmdir(root.path().c_str()), SyscallSucceeds());
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_DELETE_SELF, wd), Event(IN_IGNORED, wd)}));
+}
+
+TEST(Inotify, MoveGeneratesEvents) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ const TempPath dir1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path()));
+ const TempPath dir2 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path()));
+
+ const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ const int dir1_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), dir1.path(), IN_ALL_EVENTS));
+ const int dir2_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), dir2.path(), IN_ALL_EVENTS));
+ // Test move from root -> root.
+ std::string newpath = NewTempAbsPathInDir(root.path());
+ std::string oldpath = file1.release();
+ EXPECT_THAT(rename(oldpath.c_str(), newpath.c_str()), SyscallSucceeds());
+ file1.reset(newpath);
+ std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(
+ events,
+ Are({Event(IN_MOVED_FROM, root_wd, Basename(oldpath), events[0].cookie),
+ Event(IN_MOVED_TO, root_wd, Basename(newpath), events[1].cookie)}));
+ EXPECT_NE(events[0].cookie, 0);
+ EXPECT_EQ(events[0].cookie, events[1].cookie);
+ uint32_t last_cookie = events[0].cookie;
+
+ // Test move from root -> root/dir1.
+ newpath = NewTempAbsPathInDir(dir1.path());
+ oldpath = file1.release();
+ EXPECT_THAT(rename(oldpath.c_str(), newpath.c_str()), SyscallSucceeds());
+ file1.reset(newpath);
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(
+ events,
+ Are({Event(IN_MOVED_FROM, root_wd, Basename(oldpath), events[0].cookie),
+ Event(IN_MOVED_TO, dir1_wd, Basename(newpath), events[1].cookie)}));
+ // Cookies should be distinct between distinct rename events.
+ EXPECT_NE(events[0].cookie, last_cookie);
+ EXPECT_EQ(events[0].cookie, events[1].cookie);
+ last_cookie = events[0].cookie;
+
+ // Test move from root/dir1 -> root/dir2.
+ newpath = NewTempAbsPathInDir(dir2.path());
+ oldpath = file1.release();
+ EXPECT_THAT(rename(oldpath.c_str(), newpath.c_str()), SyscallSucceeds());
+ file1.reset(newpath);
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(
+ events,
+ Are({Event(IN_MOVED_FROM, dir1_wd, Basename(oldpath), events[0].cookie),
+ Event(IN_MOVED_TO, dir2_wd, Basename(newpath), events[1].cookie)}));
+ EXPECT_NE(events[0].cookie, last_cookie);
+ EXPECT_EQ(events[0].cookie, events[1].cookie);
+ last_cookie = events[0].cookie;
+}
+
+TEST(Inotify, MoveWatchedTargetGeneratesEvents) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ const std::string newpath = NewTempAbsPathInDir(root.path());
+ const std::string oldpath = file1.release();
+ EXPECT_THAT(rename(oldpath.c_str(), newpath.c_str()), SyscallSucceeds());
+ file1.reset(newpath);
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(
+ events,
+ Are({Event(IN_MOVED_FROM, root_wd, Basename(oldpath), events[0].cookie),
+ Event(IN_MOVED_TO, root_wd, Basename(newpath), events[1].cookie),
+ // Self move events do not have a cookie.
+ Event(IN_MOVE_SELF, file1_wd)}));
+ EXPECT_NE(events[0].cookie, 0);
+ EXPECT_EQ(events[0].cookie, events[1].cookie);
+}
+
+TEST(Inotify, CoalesceEvents) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ root.path(), "some content", TempPath::kDefaultFileMode));
+
+ FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ // Read the file a few times. This will would generate multiple IN_ACCESS
+ // events but they should get coalesced to a single event.
+ char buf;
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+
+ // Use the close event verify that we haven't simply left the additional
+ // IN_ACCESS events unread.
+ file1_fd.reset(); // Close file1_fd.
+
+ const std::string file1_name = std::string(Basename(file1.path()));
+ std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_ACCESS, wd, file1_name),
+ Event(IN_CLOSE_NOWRITE, wd, file1_name)}));
+
+ // Now let's try interleaving other events into a stream of repeated events.
+ file1_fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR));
+
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+ EXPECT_THAT(write(file1_fd.get(), "x", 1), SyscallSucceeds());
+ EXPECT_THAT(write(file1_fd.get(), "x", 1), SyscallSucceeds());
+ EXPECT_THAT(write(file1_fd.get(), "x", 1), SyscallSucceeds());
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+
+ file1_fd.reset(); // Close the file.
+
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(
+ events,
+ Are({Event(IN_OPEN, wd, file1_name), Event(IN_ACCESS, wd, file1_name),
+ Event(IN_MODIFY, wd, file1_name), Event(IN_ACCESS, wd, file1_name),
+ Event(IN_CLOSE_WRITE, wd, file1_name)}));
+
+ // Ensure events aren't coalesced if they are from different files.
+ const TempPath file2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ root.path(), "some content", TempPath::kDefaultFileMode));
+ // Discard events resulting from creation of file2.
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+
+ file1_fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+ FileDescriptor file2_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file2.path(), O_RDONLY));
+
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+ EXPECT_THAT(read(file2_fd.get(), &buf, 1), SyscallSucceeds());
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+
+ // Close both files.
+ file1_fd.reset();
+ file2_fd.reset();
+
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ const std::string file2_name = std::string(Basename(file2.path()));
+ ASSERT_THAT(
+ events,
+ Are({Event(IN_OPEN, wd, file1_name), Event(IN_OPEN, wd, file2_name),
+ Event(IN_ACCESS, wd, file1_name), Event(IN_ACCESS, wd, file2_name),
+ Event(IN_ACCESS, wd, file1_name),
+ Event(IN_CLOSE_NOWRITE, wd, file1_name),
+ Event(IN_CLOSE_NOWRITE, wd, file2_name)}));
+}
+
+TEST(Inotify, ClosingInotifyFdWithoutRemovingWatchesWorks) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+ // Note: The check on close will happen in FileDescriptor::~FileDescriptor().
+}
+
+TEST(Inotify, NestedWatches) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ root.path(), "some content", TempPath::kDefaultFileMode));
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+
+ const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ // Read from file1. This should generate an event for both watches.
+ char buf;
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_ACCESS, root_wd, Basename(file1.path())),
+ Event(IN_ACCESS, file1_wd)}));
+}
+
+TEST(Inotify, ConcurrentThreadsGeneratingEvents) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ std::vector<TempPath> files;
+ files.reserve(10);
+ for (int i = 0; i < 10; i++) {
+ files.emplace_back(ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ root.path(), "some content", TempPath::kDefaultFileMode)));
+ }
+
+ auto test_thread = [&files]() {
+ uint32_t seed = time(nullptr);
+ for (int i = 0; i < 20; i++) {
+ const TempPath& file = files[rand_r(&seed) % files.size()];
+ const FileDescriptor file_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY));
+ TEST_PCHECK(write(file_fd.get(), "x", 1) == 1);
+ }
+ };
+
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ std::list<ScopedThread> threads;
+ for (int i = 0; i < 3; i++) {
+ threads.emplace_back(test_thread);
+ }
+ for (auto& t : threads) {
+ t.Join();
+ }
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ // 3 threads doing 20 iterations, 3 events per iteration (open, write,
+ // close). However, some events may be coalesced, and we can't reliably
+ // predict how they'll be coalesced since the test threads aren't
+ // synchronized. We can only check that we aren't getting unexpected events.
+ for (const Event& ev : events) {
+ EXPECT_NE(ev.mask & (IN_OPEN | IN_MODIFY | IN_CLOSE_WRITE), 0);
+ }
+}
+
+TEST(Inotify, ReadWithTooSmallBufferFails) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ // Open the file to queue an event. This event will not have a filename, so
+ // reading from the inotify fd should return sizeof(struct inotify_event)
+ // bytes of data.
+ FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+ std::vector<char> buf(kBufSize, 0);
+ ssize_t readlen;
+
+ // Try a buffer too small to hold any potential event. This is rejected
+ // outright without the event being dequeued.
+ EXPECT_THAT(read(fd.get(), buf.data(), sizeof(struct inotify_event) - 1),
+ SyscallFailsWithErrno(EINVAL));
+ // Try a buffer just large enough. This should succeeed.
+ EXPECT_THAT(
+ readlen = read(fd.get(), buf.data(), sizeof(struct inotify_event)),
+ SyscallSucceeds());
+ EXPECT_EQ(readlen, sizeof(struct inotify_event));
+ // Event queue is now empty, the next read should return EAGAIN.
+ EXPECT_THAT(read(fd.get(), buf.data(), sizeof(struct inotify_event)),
+ SyscallFailsWithErrno(EAGAIN));
+
+ // Now put a watch on the directory, so that generated events contain a name.
+ EXPECT_THAT(inotify_rm_watch(fd.get(), wd), SyscallSucceeds());
+
+ // Drain the event generated from the watch removal.
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ file1_fd.reset(); // Close file to generate an event.
+
+ // Try a buffer too small to hold any event and one too small to hold an event
+ // with a name. These should both fail without consuming the event.
+ EXPECT_THAT(read(fd.get(), buf.data(), sizeof(struct inotify_event) - 1),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(read(fd.get(), buf.data(), sizeof(struct inotify_event)),
+ SyscallFailsWithErrno(EINVAL));
+ // Now try with a large enough buffer. This should return the one event.
+ EXPECT_THAT(readlen = read(fd.get(), buf.data(), buf.size()),
+ SyscallSucceeds());
+ EXPECT_GE(readlen,
+ sizeof(struct inotify_event) + Basename(file1.path()).size());
+ // With the single event read, the queue should once again be empty.
+ EXPECT_THAT(read(fd.get(), buf.data(), sizeof(struct inotify_event)),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST(Inotify, BlockingReadOnInotifyFd) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(0));
+ const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ root.path(), "some content", TempPath::kDefaultFileMode));
+
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ // Spawn a thread performing a blocking read for new events on the inotify fd.
+ std::vector<char> buf(kBufSize, 0);
+ const int shared_fd = fd.get(); // The thread needs it.
+ ScopedThread t([shared_fd, &buf]() {
+ ssize_t readlen;
+ EXPECT_THAT(readlen = read(shared_fd, buf.data(), buf.size()),
+ SyscallSucceeds());
+ });
+
+ // Perform a read on the watched file, which should generate an IN_ACCESS
+ // event, unblocking the event_reader thread.
+ char c;
+ EXPECT_THAT(read(file1_fd.get(), &c, 1), SyscallSucceeds());
+
+ // Wait for the thread to read the event and exit.
+ t.Join();
+
+ // Make sure the event we got back is sane.
+ uint32_t event_mask;
+ memcpy(&event_mask, buf.data() + offsetof(struct inotify_event, mask),
+ sizeof(event_mask));
+ EXPECT_EQ(event_mask, IN_ACCESS);
+}
+
+TEST(Inotify, WatchOnRelativePath) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ root.path(), "some content", TempPath::kDefaultFileMode));
+
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+
+ // Change working directory to root.
+ const FileDescriptor cwd = ASSERT_NO_ERRNO_AND_VALUE(Open(".", O_PATH));
+ EXPECT_THAT(chdir(root.path().c_str()), SyscallSucceeds());
+
+ // Add a watch on file1 with a relative path.
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+ fd.get(), std::string(Basename(file1.path())), IN_ALL_EVENTS));
+
+ // Perform a read on file1, this should generate an IN_ACCESS event.
+ char c;
+ EXPECT_THAT(read(file1_fd.get(), &c, 1), SyscallSucceeds());
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_ACCESS, wd)}));
+
+ // Explicitly reset the working directory so that we don't continue to
+ // reference "root". Once the test ends, "root" will get unlinked. If we
+ // continue to hold a reference, random save/restore tests can fail if a save
+ // is triggered after "root" is unlinked; we can't save deleted fs objects
+ // with active references.
+ EXPECT_THAT(fchdir(cwd.get()), SyscallSucceeds());
+}
+
+TEST(Inotify, ZeroLengthReadWriteDoesNotGenerateEvent) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const char kContent[] = "some content";
+ TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ root.path(), kContent, TempPath::kDefaultFileMode));
+ const int kContentSize = sizeof(kContent) - 1;
+
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR));
+
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ std::vector<char> buf(kContentSize, 0);
+ // Read all available data.
+ ssize_t readlen;
+ EXPECT_THAT(readlen = read(file1_fd.get(), buf.data(), kContentSize),
+ SyscallSucceeds());
+ EXPECT_EQ(readlen, kContentSize);
+ // Drain all events and make sure we got the IN_ACCESS for the read.
+ std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_ACCESS, wd, Basename(file1.path()))}));
+
+ // Now try read again. This should be a 0-length read, since we're at EOF.
+ char c;
+ EXPECT_THAT(readlen = read(file1_fd.get(), &c, 1), SyscallSucceeds());
+ EXPECT_EQ(readlen, 0);
+ // We should have no new events.
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ EXPECT_TRUE(events.empty());
+
+ // Try issuing a zero-length read.
+ EXPECT_THAT(readlen = read(file1_fd.get(), &c, 0), SyscallSucceeds());
+ EXPECT_EQ(readlen, 0);
+ // We should have no new events.
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ EXPECT_TRUE(events.empty());
+
+ // Try issuing a zero-length write.
+ ssize_t writelen;
+ EXPECT_THAT(writelen = write(file1_fd.get(), &c, 0), SyscallSucceeds());
+ EXPECT_EQ(writelen, 0);
+ // We should have no new events.
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ EXPECT_TRUE(events.empty());
+}
+
+TEST(Inotify, ChmodGeneratesAttribEvent_NoRandomSave) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ FileDescriptor root_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(root.path(), O_RDONLY));
+ FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR));
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ auto verify_chmod_events = [&]() {
+ std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_ATTRIB, root_wd, Basename(file1.path())),
+ Event(IN_ATTRIB, file1_wd)}));
+ };
+
+ // Don't do cooperative S/R tests for any of the {f}chmod* syscalls below, the
+ // test will always fail because nodes cannot be saved when they have stricter
+ // permissions than the original host node.
+ const DisableSave ds;
+
+ // Chmod.
+ ASSERT_THAT(chmod(file1.path().c_str(), S_IWGRP), SyscallSucceeds());
+ verify_chmod_events();
+
+ // Fchmod.
+ ASSERT_THAT(fchmod(file1_fd.get(), S_IRGRP | S_IWGRP), SyscallSucceeds());
+ verify_chmod_events();
+
+ // Fchmodat.
+ const std::string file1_basename = std::string(Basename(file1.path()));
+ ASSERT_THAT(fchmodat(root_fd.get(), file1_basename.c_str(), S_IWGRP, 0),
+ SyscallSucceeds());
+ verify_chmod_events();
+
+ // Make sure the chmod'ed file descriptors are destroyed before DisableSave
+ // is destructed.
+ root_fd.reset();
+ file1_fd.reset();
+}
+
+TEST(Inotify, TruncateGeneratesModifyEvent) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ auto verify_truncate_events = [&]() {
+ std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_MODIFY, root_wd, Basename(file1.path())),
+ Event(IN_MODIFY, file1_wd)}));
+ };
+
+ // Truncate.
+ EXPECT_THAT(truncate(file1.path().c_str(), 4096), SyscallSucceeds());
+ verify_truncate_events();
+
+ // Ftruncate.
+ EXPECT_THAT(ftruncate(file1_fd.get(), 8192), SyscallSucceeds());
+ verify_truncate_events();
+
+ // No events if truncate fails.
+ EXPECT_THAT(ftruncate(file1_fd.get(), -1), SyscallFailsWithErrno(EINVAL));
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({}));
+}
+
+TEST(Inotify, GetdentsGeneratesAccessEvent) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ // This internally calls getdents(2). We also expect to see an open/close
+ // event for the dirfd.
+ ASSERT_NO_ERRNO_AND_VALUE(ListDir(root.path(), false));
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+
+ // Linux only seems to generate access events on getdents() on some
+ // calls. Allow the test to pass even if it isn't generated. gVisor will
+ // always generate the IN_ACCESS event so the test will at least ensure gVisor
+ // behaves reasonably.
+ int i = 0;
+ EXPECT_EQ(events[i].mask, IN_OPEN | IN_ISDIR);
+ ++i;
+ if (IsRunningOnGvisor()) {
+ EXPECT_EQ(events[i].mask, IN_ACCESS | IN_ISDIR);
+ ++i;
+ } else {
+ if (events[i].mask == (IN_ACCESS | IN_ISDIR)) {
+ // Skip over the IN_ACCESS event on Linux, it only shows up some of the
+ // time so we can't assert its existence.
+ ++i;
+ }
+ }
+ EXPECT_EQ(events[i].mask, IN_CLOSE_NOWRITE | IN_ISDIR);
+}
+
+TEST(Inotify, MknodGeneratesCreateEvent) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ const TempPath file1(root.path() + "/file1");
+ const int rc = mknod(file1.path().c_str(), S_IFREG, 0);
+ // mknod(2) is only supported on tmpfs in the sandbox.
+ SKIP_IF(IsRunningOnGvisor() && rc != 0);
+ ASSERT_THAT(rc, SyscallSucceeds());
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_CREATE, wd, Basename(file1.path()))}));
+}
+
+TEST(Inotify, SymlinkGeneratesCreateEvent) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+ const TempPath link1(NewTempAbsPathInDir(root.path()));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ ASSERT_THAT(symlink(file1.path().c_str(), link1.path().c_str()),
+ SyscallSucceeds());
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+
+ ASSERT_THAT(events, Are({Event(IN_CREATE, root_wd, Basename(link1.path()))}));
+}
+
+TEST(Inotify, LinkGeneratesAttribAndCreateEvents) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+ const TempPath link1(root.path() + "/link1");
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ const int rc = link(file1.path().c_str(), link1.path().c_str());
+ // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+ SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
+ (errno == EPERM || errno == ENOENT));
+ ASSERT_THAT(rc, SyscallSucceeds());
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_ATTRIB, file1_wd),
+ Event(IN_CREATE, root_wd, Basename(link1.path()))}));
+}
+
+TEST(Inotify, UtimesGeneratesAttribEvent) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+
+ const struct timeval times[2] = {{1, 0}, {2, 0}};
+ EXPECT_THAT(futimes(file1_fd.get(), times), SyscallSucceeds());
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file1.path()))}));
+}
+
+TEST(Inotify, HardlinksReuseSameWatch) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+ TempPath link1(root.path() + "/link1");
+ const int rc = link(file1.path().c_str(), link1.path().c_str());
+ // link(2) is only supported on tmpfs in the sandbox.
+ SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
+ (errno == EPERM || errno == ENOENT));
+ ASSERT_THAT(rc, SyscallSucceeds());
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const int root_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ const int file1_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+ const int link1_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), link1.path(), IN_ALL_EVENTS));
+
+ // The watch descriptors for watches on different links to the same file
+ // should be identical.
+ EXPECT_NE(root_wd, file1_wd);
+ EXPECT_EQ(file1_wd, link1_wd);
+
+ FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY));
+
+ std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events,
+ AreUnordered({Event(IN_OPEN, root_wd, Basename(file1.path())),
+ Event(IN_OPEN, file1_wd)}));
+
+ // For the next step, we want to ensure all fds to the file are closed. Do
+ // that now and drain the resulting events.
+ file1_fd.reset();
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events,
+ Are({Event(IN_CLOSE_WRITE, root_wd, Basename(file1.path())),
+ Event(IN_CLOSE_WRITE, file1_wd)}));
+
+ // Try removing the link and let's see what events show up. Note that after
+ // this, we still have a link to the file so the watch shouldn't be
+ // automatically removed.
+ const std::string link1_path = link1.reset();
+
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_ATTRIB, link1_wd),
+ Event(IN_DELETE, root_wd, Basename(link1_path))}));
+
+ // Now remove the other link. Since this is the last link to the file, the
+ // watch should be automatically removed.
+ const std::string file1_path = file1.reset();
+
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(
+ events,
+ AreUnordered({Event(IN_ATTRIB, file1_wd), Event(IN_DELETE_SELF, file1_wd),
+ Event(IN_IGNORED, file1_wd),
+ Event(IN_DELETE, root_wd, Basename(file1_path))}));
+}
+
+// Calling mkdir within "parent/child" should generate an event for child, but
+// not parent.
+TEST(Inotify, MkdirGeneratesCreateEventWithDirFlag) {
+ const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath child =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path()));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), parent.path(), IN_ALL_EVENTS));
+ const int child_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), child.path(), IN_ALL_EVENTS));
+
+ const TempPath dir1(NewTempAbsPathInDir(child.path()));
+ ASSERT_THAT(mkdir(dir1.path().c_str(), 0777), SyscallSucceeds());
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(
+ events,
+ Are({Event(IN_CREATE | IN_ISDIR, child_wd, Basename(dir1.path()))}));
+}
+
+TEST(Inotify, MultipleInotifyInstancesAndWatchesAllGetEvents) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY));
+ constexpr int kNumFds = 30;
+ std::vector<FileDescriptor> inotify_fds;
+
+ for (int i = 0; i < kNumFds; ++i) {
+ const DisableSave ds; // Too expensive.
+ inotify_fds.emplace_back(
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)));
+ const FileDescriptor& fd = inotify_fds[inotify_fds.size() - 1]; // Back.
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+ }
+
+ const std::string data = "some content";
+ EXPECT_THAT(write(file1_fd.get(), data.c_str(), data.length()),
+ SyscallSucceeds());
+
+ for (const FileDescriptor& fd : inotify_fds) {
+ const DisableSave ds; // Too expensive.
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ if (events.size() >= 2) {
+ EXPECT_EQ(events[0].mask, IN_MODIFY);
+ EXPECT_EQ(events[0].wd, 1);
+ EXPECT_EQ(events[0].name, Basename(file1.path()));
+ EXPECT_EQ(events[1].mask, IN_MODIFY);
+ EXPECT_EQ(events[1].wd, 2);
+ EXPECT_EQ(events[1].name, "");
+ }
+ }
+}
+
+TEST(Inotify, EventsGoUpAtMostOneLevel) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath dir1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path()));
+ TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS));
+ const int dir1_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), dir1.path(), IN_ALL_EVENTS));
+
+ const std::string file1_path = file1.reset();
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_DELETE, dir1_wd, Basename(file1_path))}));
+}
+
+TEST(Inotify, DuplicateWatchReturnsSameWatchDescriptor) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const int wd1 = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+ const int wd2 = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ EXPECT_EQ(wd1, wd2);
+
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY));
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ // The watch shouldn't be duplicated, we only expect one event.
+ ASSERT_THAT(events, Are({Event(IN_OPEN, wd1)}));
+}
+
+TEST(Inotify, UnmatchedEventsAreDiscarded) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ACCESS));
+
+ FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY));
+
+ std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ // We only asked for access events, the open event should be discarded.
+ ASSERT_THAT(events, Are({}));
+
+ // IN_IGNORED events are always generated, regardless of the mask.
+ file1_fd.reset();
+ file1.reset();
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_IGNORED, wd)}));
+}
+
+TEST(Inotify, AddWatchWithInvalidEventMaskFails) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ EXPECT_THAT(inotify_add_watch(fd.get(), root.path().c_str(), 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(Inotify, AddWatchOnInvalidPathFails) {
+ const TempPath nonexistent(NewTempAbsPath());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ // Non-existent path.
+ EXPECT_THAT(
+ inotify_add_watch(fd.get(), nonexistent.path().c_str(), IN_CREATE),
+ SyscallFailsWithErrno(ENOENT));
+
+ // Garbage path pointer.
+ EXPECT_THAT(inotify_add_watch(fd.get(), nullptr, IN_CREATE),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST(Inotify, InOnlyDirFlagRespected) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ EXPECT_THAT(
+ inotify_add_watch(fd.get(), root.path().c_str(), IN_ACCESS | IN_ONLYDIR),
+ SyscallSucceeds());
+
+ EXPECT_THAT(
+ inotify_add_watch(fd.get(), file1.path().c_str(), IN_ACCESS | IN_ONLYDIR),
+ SyscallFailsWithErrno(ENOTDIR));
+}
+
+TEST(Inotify, MaskAddMergesWithExistingEventMask) {
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path()));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY));
+
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_OPEN | IN_CLOSE_WRITE));
+
+ const std::string data = "some content";
+ EXPECT_THAT(write(file1_fd.get(), data.c_str(), data.length()),
+ SyscallSucceeds());
+
+ // We shouldn't get any events, since IN_MODIFY wasn't in the event mask.
+ std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({}));
+
+ // Add IN_MODIFY to event mask.
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_MODIFY | IN_MASK_ADD));
+
+ EXPECT_THAT(write(file1_fd.get(), data.c_str(), data.length()),
+ SyscallSucceeds());
+
+ // This time we should get the modify event.
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_MODIFY, wd)}));
+
+ // Now close the fd. If the modify event was added to the event mask rather
+ // than replacing the event mask we won't get the close event.
+ file1_fd.reset();
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_CLOSE_WRITE, wd)}));
+}
+
+// Test that control events bits are not considered when checking event mask.
+TEST(Inotify, ControlEvents) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), dir.path(), IN_ACCESS));
+
+ // Check that events in the mask are dispatched and that control bits are
+ // part of the event mask.
+ std::vector<std::string> files =
+ ASSERT_NO_ERRNO_AND_VALUE(ListDir(dir.path(), false));
+ ASSERT_EQ(files.size(), 2);
+
+ const std::vector<Event> events1 =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events1, Are({Event(IN_ACCESS | IN_ISDIR, wd)}));
+
+ // Check that events not in the mask are discarded.
+ const FileDescriptor dir_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+ const std::vector<Event> events2 =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ ASSERT_THAT(events2, Are({}));
+}
+
+// Regression test to ensure epoll and directory access doesn't deadlock.
+TEST(Inotify, EpollNoDeadlock) {
+ const DisableSave ds; // Too many syscalls.
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ // Create lots of directories and watch all of them.
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ std::vector<TempPath> children;
+ for (size_t i = 0; i < 1000; ++i) {
+ auto child = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path()));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), child.path(), IN_ACCESS));
+ children.emplace_back(std::move(child));
+ }
+
+ // Run epoll_wait constantly in a separate thread.
+ std::atomic<bool> done(false);
+ ScopedThread th([&fd, &done] {
+ for (auto start = absl::Now(); absl::Now() - start < absl::Seconds(5);) {
+ FileDescriptor epoll_fd = ASSERT_NO_ERRNO_AND_VALUE(NewEpollFD());
+ ASSERT_NO_ERRNO(RegisterEpollFD(epoll_fd.get(), fd.get(),
+ EPOLLIN | EPOLLOUT | EPOLLET, 0));
+ struct epoll_event result[1];
+ EXPECT_THAT(RetryEINTR(epoll_wait)(epoll_fd.get(), result, 1, -1),
+ SyscallSucceedsWithValue(1));
+
+ sched_yield();
+ }
+ done = true;
+ });
+
+ // While epoll thread is running, constantly access all directories to
+ // generate inotify events.
+ while (!done) {
+ std::vector<std::string> files =
+ ASSERT_NO_ERRNO_AND_VALUE(ListDir(root.path(), false));
+ ASSERT_EQ(files.size(), 1002);
+ for (const auto& child : files) {
+ if (child == "." || child == "..") {
+ continue;
+ }
+ ASSERT_NO_ERRNO_AND_VALUE(ListDir(JoinPath(root.path(), child), false));
+ }
+ sched_yield();
+ }
+}
+
+// On Linux, inotify behavior is not very consistent with splice(2). We try our
+// best to emulate Linux for very basic calls to splice.
+TEST(Inotify, SpliceOnWatchTarget) {
+ int pipes[2];
+ ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds());
+
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ dir.path(), "some content", TempPath::kDefaultFileMode));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+ const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(inotify_fd.get(), dir.path(), IN_ALL_EVENTS));
+ const int file_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(inotify_fd.get(), file.path(), IN_ALL_EVENTS));
+
+ EXPECT_THAT(splice(fd.get(), nullptr, pipes[1], nullptr, 1, /*flags=*/0),
+ SyscallSucceedsWithValue(1));
+
+ // Surprisingly, events are not generated in Linux if we read from a file.
+ std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ ASSERT_THAT(events, Are({}));
+
+ EXPECT_THAT(splice(pipes[0], nullptr, fd.get(), nullptr, 1, /*flags=*/0),
+ SyscallSucceedsWithValue(1));
+
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ ASSERT_THAT(events, Are({
+ Event(IN_MODIFY, dir_wd, Basename(file.path())),
+ Event(IN_MODIFY, file_wd),
+ }));
+}
+
+TEST(Inotify, SpliceOnInotifyFD) {
+ int pipes[2];
+ ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds());
+
+ const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ root.path(), "some content", TempPath::kDefaultFileMode));
+
+ const FileDescriptor file1_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDONLY));
+ const int watcher = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS));
+
+ char buf;
+ EXPECT_THAT(read(file1_fd.get(), &buf, 1), SyscallSucceeds());
+
+ EXPECT_THAT(splice(fd.get(), nullptr, pipes[1], nullptr,
+ sizeof(struct inotify_event) + 1, SPLICE_F_NONBLOCK),
+ SyscallSucceedsWithValue(sizeof(struct inotify_event)));
+
+ const FileDescriptor read_fd(pipes[0]);
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(read_fd.get()));
+ ASSERT_THAT(events, Are({Event(IN_ACCESS, watcher)}));
+}
+
+// Watches on a parent should not be triggered by actions on a hard link to one
+// of its children that has a different parent.
+TEST(Inotify, LinkOnOtherParent) {
+ const TempPath dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
+ std::string link_path = NewTempAbsPathInDir(dir2.path());
+
+ const int rc = link(file.path().c_str(), link_path.c_str());
+ // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+ SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
+ (errno == EPERM || errno == ENOENT));
+ ASSERT_THAT(rc, SyscallSucceeds());
+
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(inotify_fd.get(), dir1.path(), IN_ALL_EVENTS));
+
+ // Perform various actions on the link outside of dir1, which should trigger
+ // no inotify events.
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(link_path.c_str(), O_RDWR));
+ int val = 0;
+ ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+ ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+ ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds());
+ ASSERT_THAT(unlink(link_path.c_str()), SyscallSucceeds());
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({}));
+}
+
+TEST(Inotify, Xattr) {
+ // TODO(gvisor.dev/issue/1636): Support extended attributes in runsc gofer.
+ SKIP_IF(IsRunningOnGvisor());
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const std::string path = file.path();
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_RDWR));
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(inotify_fd.get(), path, IN_ALL_EVENTS));
+
+ const char* cpath = path.c_str();
+ const char* name = "user.test";
+ int val = 123;
+ ASSERT_THAT(setxattr(cpath, name, &val, sizeof(val), /*flags=*/0),
+ SyscallSucceeds());
+ std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd)}));
+
+ ASSERT_THAT(getxattr(cpath, name, &val, sizeof(val)), SyscallSucceeds());
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({}));
+
+ char list[100];
+ ASSERT_THAT(listxattr(cpath, list, sizeof(list)), SyscallSucceeds());
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({}));
+
+ ASSERT_THAT(removexattr(cpath, name), SyscallSucceeds());
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd)}));
+
+ ASSERT_THAT(fsetxattr(fd.get(), name, &val, sizeof(val), /*flags=*/0),
+ SyscallSucceeds());
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd)}));
+
+ ASSERT_THAT(fgetxattr(fd.get(), name, &val, sizeof(val)), SyscallSucceeds());
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({}));
+
+ ASSERT_THAT(flistxattr(fd.get(), list, sizeof(list)), SyscallSucceeds());
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({}));
+
+ ASSERT_THAT(fremovexattr(fd.get(), name), SyscallSucceeds());
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd)}));
+}
+
+TEST(Inotify, Exec) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath bin = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(dir.path(), "/bin/true"));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(fd.get(), bin.path(), IN_ALL_EVENTS));
+
+ // Perform exec.
+ ScopedThread t([&bin]() {
+ ASSERT_THAT(execl(bin.path().c_str(), bin.path().c_str(), (char*)nullptr),
+ SyscallSucceeds());
+ });
+ t.Join();
+
+ std::vector<Event> events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get()));
+ EXPECT_THAT(events, Are({Event(IN_OPEN, wd), Event(IN_ACCESS, wd)}));
+}
+
+// Watches without IN_EXCL_UNLINK, should continue to emit events for file
+// descriptors after their corresponding files have been unlinked.
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, IncludeUnlinkedFile_NoRandomSave) {
+ const DisableSave ds;
+
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateFileWith(dir.path(), "123", TempPath::kDefaultFileMode));
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(inotify_fd.get(), dir.path(), IN_ALL_EVENTS));
+ const int file_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(inotify_fd.get(), file.path(), IN_ALL_EVENTS));
+
+ ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+ int val = 0;
+ ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+ ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+ std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({
+ Event(IN_ATTRIB, file_wd),
+ Event(IN_DELETE, dir_wd, Basename(file.path())),
+ Event(IN_ACCESS, dir_wd, Basename(file.path())),
+ Event(IN_ACCESS, file_wd),
+ Event(IN_MODIFY, dir_wd, Basename(file.path())),
+ Event(IN_MODIFY, file_wd),
+ }));
+
+ fd.reset();
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({
+ Event(IN_CLOSE_WRITE, dir_wd, Basename(file.path())),
+ Event(IN_CLOSE_WRITE, file_wd),
+ Event(IN_DELETE_SELF, file_wd),
+ Event(IN_IGNORED, file_wd),
+ }));
+}
+
+// Watches created with IN_EXCL_UNLINK will stop emitting events on fds for
+// children that have already been unlinked.
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlink_NoRandomSave) {
+ const DisableSave ds;
+ // TODO(gvisor.dev/issue/1624): This test fails on VFS1.
+ SKIP_IF(IsRunningWithVFS1());
+
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+ inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+ const int file_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+ inotify_fd.get(), file.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+ // Unlink the child, which should cause further operations on the open file
+ // descriptor to be ignored.
+ ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+ int val = 0;
+ ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+ ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds());
+ std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({
+ Event(IN_ATTRIB, file_wd),
+ Event(IN_DELETE, dir_wd, Basename(file.path())),
+ }));
+
+ fd.reset();
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ ASSERT_THAT(events, Are({
+ Event(IN_DELETE_SELF, file_wd),
+ Event(IN_IGNORED, file_wd),
+ }));
+}
+
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlinkDirectory_NoRandomSave) {
+ // TODO(gvisor.dev/issue/1624): This test fails on VFS1. Remove once VFS1 is
+ // deleted.
+ SKIP_IF(IsRunningWithVFS1());
+
+ const DisableSave ds;
+
+ const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath dir =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path()));
+ std::string dirPath = dir.path();
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(dirPath.c_str(), O_RDONLY | O_DIRECTORY));
+ const int parent_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+ inotify_fd.get(), parent.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+ const int self_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+ inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+ // Unlink the dir, and then close the open fd.
+ ASSERT_THAT(rmdir(dirPath.c_str()), SyscallSucceeds());
+ dir.reset();
+
+ std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ // No close event should appear.
+ ASSERT_THAT(events,
+ Are({Event(IN_DELETE | IN_ISDIR, parent_wd, Basename(dirPath))}));
+
+ fd.reset();
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ ASSERT_THAT(events, Are({
+ Event(IN_DELETE_SELF, self_wd),
+ Event(IN_IGNORED, self_wd),
+ }));
+}
+
+// If "dir/child" and "dir/child2" are links to the same file, and "dir/child"
+// is unlinked, a watch on "dir" with IN_EXCL_UNLINK will exclude future events
+// for fds on "dir/child" but not "dir/child2".
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlinkMultipleChildren_NoRandomSave) {
+ const DisableSave ds;
+ // TODO(gvisor.dev/issue/1624): This test fails on VFS1.
+ SKIP_IF(IsRunningWithVFS1());
+
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+ std::string path1 = file.path();
+ std::string path2 = NewTempAbsPathInDir(dir.path());
+
+ const int rc = link(path1.c_str(), path2.c_str());
+ // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+ SKIP_IF(IsRunningOnGvisor() && rc != 0 &&
+ (errno == EPERM || errno == ENOENT));
+ ASSERT_THAT(rc, SyscallSucceeds());
+ const FileDescriptor fd1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(path1.c_str(), O_RDWR));
+ const FileDescriptor fd2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(path2.c_str(), O_RDWR));
+
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+ inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+ // After unlinking path1, only events on the fd for path2 should be generated.
+ ASSERT_THAT(unlink(path1.c_str()), SyscallSucceeds());
+ ASSERT_THAT(write(fd1.get(), "x", 1), SyscallSucceeds());
+ ASSERT_THAT(write(fd2.get(), "x", 1), SyscallSucceeds());
+
+ const std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({
+ Event(IN_DELETE, wd, Basename(path1)),
+ Event(IN_MODIFY, wd, Basename(path2)),
+ }));
+}
+
+// On native Linux, actions of data type FSNOTIFY_EVENT_INODE are not affected
+// by IN_EXCL_UNLINK (see
+// fs/notify/inotify/inotify_fsnotify.c:inotify_handle_event). Inode-level
+// events include changes to metadata and extended attributes.
+//
+// We need to disable S/R because there are filesystems where we cannot re-open
+// fds to an unlinked file across S/R, e.g. gofer-backed filesytems.
+TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) {
+ const DisableSave ds;
+
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path().c_str(), O_RDWR));
+
+ // NOTE(b/157163751): Create another link before unlinking. This is needed for
+ // the gofer filesystem in gVisor, where open fds will not work once the link
+ // count hits zero. In VFS2, we end up skipping the gofer test anyway, because
+ // hard links are not supported for gofer fs.
+ if (IsRunningOnGvisor()) {
+ std::string link_path = NewTempAbsPath();
+ const int rc = link(file.path().c_str(), link_path.c_str());
+ // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox.
+ SKIP_IF(rc != 0 && (errno == EPERM || errno == ENOENT));
+ ASSERT_THAT(rc, SyscallSucceeds());
+ }
+
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+ inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+ const int file_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(
+ inotify_fd.get(), file.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK));
+
+ // Even after unlinking, inode-level operations will trigger events regardless
+ // of IN_EXCL_UNLINK.
+ ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+
+ // Perform various actions on fd.
+ ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds());
+ std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({
+ Event(IN_ATTRIB, file_wd),
+ Event(IN_DELETE, dir_wd, Basename(file.path())),
+ Event(IN_MODIFY, dir_wd, Basename(file.path())),
+ Event(IN_MODIFY, file_wd),
+ }));
+
+ const struct timeval times[2] = {{1, 0}, {2, 0}};
+ ASSERT_THAT(futimes(fd.get(), times), SyscallSucceeds());
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({
+ Event(IN_ATTRIB, dir_wd, Basename(file.path())),
+ Event(IN_ATTRIB, file_wd),
+ }));
+
+ // S/R is disabled on this entire test due to behavior with unlink; it must
+ // also be disabled after this point because of fchmod.
+ ASSERT_THAT(fchmod(fd.get(), 0777), SyscallSucceeds());
+ events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({
+ Event(IN_ATTRIB, dir_wd, Basename(file.path())),
+ Event(IN_ATTRIB, file_wd),
+ }));
+}
+
+TEST(Inotify, OneShot) {
+ // TODO(gvisor.dev/issue/1624): IN_ONESHOT not supported in VFS1.
+ SKIP_IF(IsRunningWithVFS1());
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor inotify_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+
+ const int wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(inotify_fd.get(), file.path(), IN_MODIFY | IN_ONESHOT));
+
+ // Open an fd, write to it, and then close it.
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY));
+ ASSERT_THAT(write(fd.get(), "x", 1), SyscallSucceedsWithValue(1));
+ fd.reset();
+
+ // We should get a single event followed by IN_IGNORED indicating removal
+ // of the one-shot watch. Prior activity (i.e. open) that is not in the mask
+ // should not trigger removal, and activity after removal (i.e. close) should
+ // not generate events.
+ std::vector<Event> events =
+ ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get()));
+ EXPECT_THAT(events, Are({
+ Event(IN_MODIFY, wd),
+ Event(IN_IGNORED, wd),
+ }));
+
+ // The watch should already have been removed.
+ EXPECT_THAT(inotify_rm_watch(inotify_fd.get(), wd),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// This test helps verify that the lock order of filesystem and inotify locks
+// is respected when inotify instances and watch targets are concurrently being
+// destroyed.
+TEST(InotifyTest, InotifyAndTargetDestructionDoNotDeadlock_NoRandomSave) {
+ const DisableSave ds; // Too many syscalls.
+
+ // A file descriptor protected by a mutex. This ensures that while a
+ // descriptor is in use, it cannot be closed and reused for a different file
+ // description.
+ struct atomic_fd {
+ int fd;
+ absl::Mutex mu;
+ };
+
+ // Set up initial inotify instances.
+ constexpr int num_fds = 3;
+ std::vector<atomic_fd> fds(num_fds);
+ for (int i = 0; i < num_fds; i++) {
+ int fd;
+ ASSERT_THAT(fd = inotify_init1(IN_NONBLOCK), SyscallSucceeds());
+ fds[i].fd = fd;
+ }
+
+ // Set up initial watch targets.
+ std::vector<std::string> paths;
+ for (int i = 0; i < 3; i++) {
+ paths.push_back(NewTempAbsPath());
+ ASSERT_THAT(mknod(paths[i].c_str(), S_IFREG | 0600, 0), SyscallSucceeds());
+ }
+
+ constexpr absl::Duration runtime = absl::Seconds(4);
+
+ // Constantly replace each inotify instance with a new one.
+ auto replace_fds = [&] {
+ for (auto start = absl::Now(); absl::Now() - start < runtime;) {
+ for (auto& afd : fds) {
+ int new_fd;
+ ASSERT_THAT(new_fd = inotify_init1(IN_NONBLOCK), SyscallSucceeds());
+ absl::MutexLock l(&afd.mu);
+ ASSERT_THAT(close(afd.fd), SyscallSucceeds());
+ afd.fd = new_fd;
+ for (auto& p : paths) {
+ // inotify_add_watch may fail if the file at p was deleted.
+ ASSERT_THAT(inotify_add_watch(afd.fd, p.c_str(), IN_ALL_EVENTS),
+ AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(ENOENT)));
+ }
+ }
+ sched_yield();
+ }
+ };
+
+ std::list<ScopedThread> ts;
+ for (int i = 0; i < 3; i++) {
+ ts.emplace_back(replace_fds);
+ }
+
+ // Constantly replace each watch target with a new one.
+ for (auto start = absl::Now(); absl::Now() - start < runtime;) {
+ for (auto& p : paths) {
+ ASSERT_THAT(unlink(p.c_str()), SyscallSucceeds());
+ ASSERT_THAT(mknod(p.c_str(), S_IFREG | 0600, 0), SyscallSucceeds());
+ }
+ sched_yield();
+ }
+}
+
+// This test helps verify that the lock order of filesystem and inotify locks
+// is respected when adding/removing watches occurs concurrently with the
+// removal of their targets.
+TEST(InotifyTest, AddRemoveUnlinkDoNotDeadlock_NoRandomSave) {
+ const DisableSave ds; // Too many syscalls.
+
+ // Set up inotify instances.
+ constexpr int num_fds = 3;
+ std::vector<int> fds(num_fds);
+ for (int i = 0; i < num_fds; i++) {
+ ASSERT_THAT(fds[i] = inotify_init1(IN_NONBLOCK), SyscallSucceeds());
+ }
+
+ // Set up initial watch targets.
+ std::vector<std::string> paths;
+ for (int i = 0; i < 3; i++) {
+ paths.push_back(NewTempAbsPath());
+ ASSERT_THAT(mknod(paths[i].c_str(), S_IFREG | 0600, 0), SyscallSucceeds());
+ }
+
+ constexpr absl::Duration runtime = absl::Seconds(1);
+
+ // Constantly add/remove watches for each inotify instance/watch target pair.
+ auto add_remove_watches = [&] {
+ for (auto start = absl::Now(); absl::Now() - start < runtime;) {
+ for (int fd : fds) {
+ for (auto& p : paths) {
+ // Do not assert on inotify_add_watch and inotify_rm_watch. They may
+ // fail if the file at p was deleted. inotify_add_watch may also fail
+ // if another thread beat us to adding a watch.
+ const int wd = inotify_add_watch(fd, p.c_str(), IN_ALL_EVENTS);
+ if (wd > 0) {
+ inotify_rm_watch(fd, wd);
+ }
+ }
+ }
+ sched_yield();
+ }
+ };
+
+ std::list<ScopedThread> ts;
+ for (int i = 0; i < 15; i++) {
+ ts.emplace_back(add_remove_watches);
+ }
+
+ // Constantly replace each watch target with a new one.
+ for (auto start = absl::Now(); absl::Now() - start < runtime;) {
+ for (auto& p : paths) {
+ ASSERT_THAT(unlink(p.c_str()), SyscallSucceeds());
+ ASSERT_THAT(mknod(p.c_str(), S_IFREG | 0600, 0), SyscallSucceeds());
+ }
+ sched_yield();
+ }
+}
+
+// This test helps verify that the lock order of filesystem and inotify locks
+// is respected when many inotify events and filesystem operations occur
+// simultaneously.
+TEST(InotifyTest, NotifyNoDeadlock_NoRandomSave) {
+ const DisableSave ds; // Too many syscalls.
+
+ const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const std::string dir = parent.path();
+
+ // mu protects file, which will change on rename.
+ absl::Mutex mu;
+ std::string file = NewTempAbsPathInDir(dir);
+ ASSERT_THAT(mknod(file.c_str(), 0644 | S_IFREG, 0), SyscallSucceeds());
+
+ const absl::Duration runtime = absl::Milliseconds(300);
+
+ // Add/remove watches on dir and file.
+ ScopedThread add_remove_watches([&] {
+ const FileDescriptor ifd =
+ ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK));
+ int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(ifd.get(), dir, IN_ALL_EVENTS));
+ int file_wd;
+ {
+ absl::ReaderMutexLock l(&mu);
+ file_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(ifd.get(), file, IN_ALL_EVENTS));
+ }
+ for (auto start = absl::Now(); absl::Now() - start < runtime;) {
+ ASSERT_THAT(inotify_rm_watch(ifd.get(), file_wd), SyscallSucceeds());
+ ASSERT_THAT(inotify_rm_watch(ifd.get(), dir_wd), SyscallSucceeds());
+ dir_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(ifd.get(), dir, IN_ALL_EVENTS));
+ {
+ absl::ReaderMutexLock l(&mu);
+ file_wd = ASSERT_NO_ERRNO_AND_VALUE(
+ InotifyAddWatch(ifd.get(), file, IN_ALL_EVENTS));
+ }
+ sched_yield();
+ }
+ });
+
+ // Modify attributes on dir and file.
+ ScopedThread stats([&] {
+ int fd, dir_fd;
+ {
+ absl::ReaderMutexLock l(&mu);
+ ASSERT_THAT(fd = open(file.c_str(), O_RDONLY), SyscallSucceeds());
+ }
+ ASSERT_THAT(dir_fd = open(dir.c_str(), O_RDONLY | O_DIRECTORY),
+ SyscallSucceeds());
+ const struct timeval times[2] = {{1, 0}, {2, 0}};
+
+ for (auto start = absl::Now(); absl::Now() - start < runtime;) {
+ {
+ absl::ReaderMutexLock l(&mu);
+ EXPECT_THAT(utimes(file.c_str(), times), SyscallSucceeds());
+ }
+ EXPECT_THAT(futimes(fd, times), SyscallSucceeds());
+ EXPECT_THAT(utimes(dir.c_str(), times), SyscallSucceeds());
+ EXPECT_THAT(futimes(dir_fd, times), SyscallSucceeds());
+ sched_yield();
+ }
+ });
+
+ // Modify extended attributes on dir and file.
+ ScopedThread xattrs([&] {
+ // TODO(gvisor.dev/issue/1636): Support extended attributes in runsc gofer.
+ if (!IsRunningOnGvisor()) {
+ int fd;
+ {
+ absl::ReaderMutexLock l(&mu);
+ ASSERT_THAT(fd = open(file.c_str(), O_RDONLY), SyscallSucceeds());
+ }
+
+ const char* name = "user.test";
+ int val = 123;
+ for (auto start = absl::Now(); absl::Now() - start < runtime;) {
+ {
+ absl::ReaderMutexLock l(&mu);
+ ASSERT_THAT(
+ setxattr(file.c_str(), name, &val, sizeof(val), /*flags=*/0),
+ SyscallSucceeds());
+ ASSERT_THAT(removexattr(file.c_str(), name), SyscallSucceeds());
+ }
+
+ ASSERT_THAT(fsetxattr(fd, name, &val, sizeof(val), /*flags=*/0),
+ SyscallSucceeds());
+ ASSERT_THAT(fremovexattr(fd, name), SyscallSucceeds());
+ sched_yield();
+ }
+ }
+ });
+
+ // Read and write file's contents. Read and write dir's entries.
+ ScopedThread read_write([&] {
+ int fd;
+ {
+ absl::ReaderMutexLock l(&mu);
+ ASSERT_THAT(fd = open(file.c_str(), O_RDWR), SyscallSucceeds());
+ }
+ for (auto start = absl::Now(); absl::Now() - start < runtime;) {
+ int val = 123;
+ ASSERT_THAT(write(fd, &val, sizeof(val)), SyscallSucceeds());
+ ASSERT_THAT(read(fd, &val, sizeof(val)), SyscallSucceeds());
+ TempPath new_file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir));
+ ASSERT_NO_ERRNO(ListDir(dir, false));
+ new_file.reset();
+ sched_yield();
+ }
+ });
+
+ // Rename file.
+ for (auto start = absl::Now(); absl::Now() - start < runtime;) {
+ const std::string new_path = NewTempAbsPathInDir(dir);
+ {
+ absl::WriterMutexLock l(&mu);
+ ASSERT_THAT(rename(file.c_str(), new_path.c_str()), SyscallSucceeds());
+ file = new_path;
+ }
+ sched_yield();
+ }
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/ioctl.cc b/test/syscalls/linux/ioctl.cc
new file mode 100644
index 000000000..b0a07a064
--- /dev/null
+++ b/test/syscalls/linux/ioctl.cc
@@ -0,0 +1,406 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <net/if.h>
+#include <netdb.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+bool CheckNonBlocking(int fd) {
+ int ret = fcntl(fd, F_GETFL, 0);
+ TEST_CHECK(ret != -1);
+ return (ret & O_NONBLOCK) == O_NONBLOCK;
+}
+
+bool CheckCloExec(int fd) {
+ int ret = fcntl(fd, F_GETFD, 0);
+ TEST_CHECK(ret != -1);
+ return (ret & FD_CLOEXEC) == FD_CLOEXEC;
+}
+
+class IoctlTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ ASSERT_THAT(fd_ = open("/dev/null", O_RDONLY), SyscallSucceeds());
+ }
+
+ void TearDown() override {
+ if (fd_ >= 0) {
+ ASSERT_THAT(close(fd_), SyscallSucceeds());
+ fd_ = -1;
+ }
+ }
+
+ int fd() const { return fd_; }
+
+ private:
+ int fd_ = -1;
+};
+
+TEST_F(IoctlTest, BadFileDescriptor) {
+ EXPECT_THAT(ioctl(-1 /* fd */, 0), SyscallFailsWithErrno(EBADF));
+}
+
+TEST_F(IoctlTest, InvalidControlNumber) {
+ EXPECT_THAT(ioctl(STDOUT_FILENO, 0), SyscallFailsWithErrno(ENOTTY));
+}
+
+TEST_F(IoctlTest, FIONBIOSucceeds) {
+ EXPECT_FALSE(CheckNonBlocking(fd()));
+ int set = 1;
+ EXPECT_THAT(ioctl(fd(), FIONBIO, &set), SyscallSucceeds());
+ EXPECT_TRUE(CheckNonBlocking(fd()));
+ set = 0;
+ EXPECT_THAT(ioctl(fd(), FIONBIO, &set), SyscallSucceeds());
+ EXPECT_FALSE(CheckNonBlocking(fd()));
+}
+
+TEST_F(IoctlTest, FIONBIOFails) {
+ EXPECT_THAT(ioctl(fd(), FIONBIO, nullptr), SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(IoctlTest, FIONCLEXSucceeds) {
+ EXPECT_THAT(ioctl(fd(), FIONCLEX), SyscallSucceeds());
+ EXPECT_FALSE(CheckCloExec(fd()));
+}
+
+TEST_F(IoctlTest, FIOCLEXSucceeds) {
+ EXPECT_THAT(ioctl(fd(), FIOCLEX), SyscallSucceeds());
+ EXPECT_TRUE(CheckCloExec(fd()));
+}
+
+TEST_F(IoctlTest, FIOASYNCFails) {
+ EXPECT_THAT(ioctl(fd(), FIOASYNC, nullptr), SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(IoctlTest, FIOASYNCSucceeds) {
+ // Not all FDs support FIOASYNC.
+ const FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ int before = -1;
+ ASSERT_THAT(before = fcntl(s.get(), F_GETFL), SyscallSucceeds());
+
+ int set = 1;
+ EXPECT_THAT(ioctl(s.get(), FIOASYNC, &set), SyscallSucceeds());
+
+ int after_set = -1;
+ ASSERT_THAT(after_set = fcntl(s.get(), F_GETFL), SyscallSucceeds());
+ EXPECT_EQ(after_set, before | O_ASYNC) << "before was " << before;
+
+ set = 0;
+ EXPECT_THAT(ioctl(s.get(), FIOASYNC, &set), SyscallSucceeds());
+
+ ASSERT_THAT(fcntl(s.get(), F_GETFL), SyscallSucceedsWithValue(before));
+}
+
+/* Count of the number of SIGIOs handled. */
+static volatile int io_received = 0;
+
+void inc_io_handler(int sig, siginfo_t* siginfo, void* arg) { io_received++; }
+
+TEST_F(IoctlTest, FIOASYNCNoTarget) {
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ // Count SIGIOs received.
+ io_received = 0;
+ struct sigaction sa;
+ sa.sa_sigaction = inc_io_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_RESTART;
+ auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa));
+
+ // Actually allow SIGIO delivery.
+ auto mask_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO));
+
+ int set = 1;
+ EXPECT_THAT(ioctl(pair->second_fd(), FIOASYNC, &set), SyscallSucceeds());
+
+ constexpr char kData[] = "abc";
+ ASSERT_THAT(WriteFd(pair->first_fd(), kData, sizeof(kData)),
+ SyscallSucceedsWithValue(sizeof(kData)));
+
+ EXPECT_EQ(io_received, 0);
+}
+
+TEST_F(IoctlTest, FIOASYNCSelfTarget) {
+ // FIXME(b/120624367): gVisor erroneously sends SIGIO on close(2), which would
+ // kill the test when pair goes out of scope. Temporarily ignore SIGIO so that
+ // that the close signal is ignored.
+ struct sigaction sa;
+ sa.sa_handler = SIG_IGN;
+ auto early_sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ // Count SIGIOs received.
+ io_received = 0;
+ sa.sa_sigaction = inc_io_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_RESTART;
+ auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa));
+
+ // Actually allow SIGIO delivery.
+ auto mask_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO));
+
+ int set = 1;
+ EXPECT_THAT(ioctl(pair->second_fd(), FIOASYNC, &set), SyscallSucceeds());
+
+ pid_t pid = getpid();
+ EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds());
+
+ constexpr char kData[] = "abc";
+ ASSERT_THAT(WriteFd(pair->first_fd(), kData, sizeof(kData)),
+ SyscallSucceedsWithValue(sizeof(kData)));
+
+ EXPECT_EQ(io_received, 1);
+}
+
+// Equivalent to FIOASYNCSelfTarget except that FIOSETOWN is called before
+// FIOASYNC.
+TEST_F(IoctlTest, FIOASYNCSelfTarget2) {
+ // FIXME(b/120624367): gVisor erroneously sends SIGIO on close(2), which would
+ // kill the test when pair goes out of scope. Temporarily ignore SIGIO so that
+ // that the close signal is ignored.
+ struct sigaction sa;
+ sa.sa_handler = SIG_IGN;
+ auto early_sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ // Count SIGIOs received.
+ io_received = 0;
+ sa.sa_sigaction = inc_io_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_RESTART;
+ auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa));
+
+ // Actually allow SIGIO delivery.
+ auto mask_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO));
+
+ pid_t pid = -1;
+ EXPECT_THAT(pid = getpid(), SyscallSucceeds());
+ EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds());
+
+ int set = 1;
+ EXPECT_THAT(ioctl(pair->second_fd(), FIOASYNC, &set), SyscallSucceeds());
+
+ constexpr char kData[] = "abc";
+ ASSERT_THAT(WriteFd(pair->first_fd(), kData, sizeof(kData)),
+ SyscallSucceedsWithValue(sizeof(kData)));
+
+ EXPECT_EQ(io_received, 1);
+}
+
+// Check that closing an FD does not result in an event.
+TEST_F(IoctlTest, FIOASYNCSelfTargetClose) {
+ // Count SIGIOs received.
+ struct sigaction sa;
+ io_received = 0;
+ sa.sa_sigaction = inc_io_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_RESTART;
+ auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa));
+
+ // Actually allow SIGIO delivery.
+ auto mask_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO));
+
+ for (int i = 0; i < 2; i++) {
+ auto pair = ASSERT_NO_ERRNO_AND_VALUE(
+ UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ pid_t pid = getpid();
+ EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds());
+
+ int set = 1;
+ EXPECT_THAT(ioctl(pair->second_fd(), FIOASYNC, &set), SyscallSucceeds());
+ }
+
+ // FIXME(b/120624367): gVisor erroneously sends SIGIO on close.
+ SKIP_IF(IsRunningOnGvisor());
+
+ EXPECT_EQ(io_received, 0);
+}
+
+TEST_F(IoctlTest, FIOASYNCInvalidPID) {
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int set = 1;
+ ASSERT_THAT(ioctl(pair->second_fd(), FIOASYNC, &set), SyscallSucceeds());
+ pid_t pid = INT_MAX;
+ // This succeeds (with behavior equivalent to a pid of 0) in Linux prior to
+ // f73127356f34 "fs/fcntl: return -ESRCH in f_setown when pid/pgid can't be
+ // found", and fails with EPERM after that commit.
+ EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid),
+ AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(ESRCH)));
+}
+
+TEST_F(IoctlTest, FIOASYNCUnsetTarget) {
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ // Count SIGIOs received.
+ io_received = 0;
+ struct sigaction sa;
+ sa.sa_sigaction = inc_io_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_RESTART;
+ auto sa_cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGIO, sa));
+
+ // Actually allow SIGIO delivery.
+ auto mask_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGIO));
+
+ int set = 1;
+ EXPECT_THAT(ioctl(pair->second_fd(), FIOASYNC, &set), SyscallSucceeds());
+
+ pid_t pid = getpid();
+ EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds());
+
+ // Passing a PID of 0 unsets the target.
+ pid = 0;
+ EXPECT_THAT(ioctl(pair->second_fd(), FIOSETOWN, &pid), SyscallSucceeds());
+
+ constexpr char kData[] = "abc";
+ ASSERT_THAT(WriteFd(pair->first_fd(), kData, sizeof(kData)),
+ SyscallSucceedsWithValue(sizeof(kData)));
+
+ EXPECT_EQ(io_received, 0);
+}
+
+using IoctlTestSIOCGIFCONF = SimpleSocketTest;
+
+TEST_P(IoctlTestSIOCGIFCONF, ValidateNoArrayGetsLength) {
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Validate that no array can be used to get the length required.
+ struct ifconf ifconf = {};
+ ASSERT_THAT(ioctl(fd->get(), SIOCGIFCONF, &ifconf), SyscallSucceeds());
+ ASSERT_GT(ifconf.ifc_len, 0);
+}
+
+// This test validates that we will only return a partial array list and not
+// partial ifrreq structs.
+TEST_P(IoctlTestSIOCGIFCONF, ValidateNoPartialIfrsReturned) {
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ struct ifreq ifr = {};
+ struct ifconf ifconf = {};
+ ifconf.ifc_len = sizeof(ifr) - 1; // One byte too few.
+ ifconf.ifc_ifcu.ifcu_req = &ifr;
+
+ ASSERT_THAT(ioctl(fd->get(), SIOCGIFCONF, &ifconf), SyscallSucceeds());
+ ASSERT_EQ(ifconf.ifc_len, 0);
+ ASSERT_EQ(ifr.ifr_name[0], '\0'); // Nothing is returned.
+
+ ifconf.ifc_len = sizeof(ifreq);
+ ASSERT_THAT(ioctl(fd->get(), SIOCGIFCONF, &ifconf), SyscallSucceeds());
+ ASSERT_GT(ifconf.ifc_len, 0);
+ ASSERT_NE(ifr.ifr_name[0], '\0'); // An interface can now be returned.
+}
+
+TEST_P(IoctlTestSIOCGIFCONF, ValidateLoopbackIsPresent) {
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ struct ifconf ifconf = {};
+ struct ifreq ifr[10] = {}; // Storage for up to 10 interfaces.
+
+ ifconf.ifc_req = ifr;
+ ifconf.ifc_len = sizeof(ifr);
+
+ ASSERT_THAT(ioctl(fd->get(), SIOCGIFCONF, &ifconf), SyscallSucceeds());
+ size_t num_if = ifconf.ifc_len / sizeof(struct ifreq);
+
+ // We should have at least one interface.
+ ASSERT_GE(num_if, 1);
+
+ // One of the interfaces should be a loopback.
+ bool found_loopback = false;
+ for (size_t i = 0; i < num_if; ++i) {
+ if (strcmp(ifr[i].ifr_name, "lo") == 0) {
+ // SIOCGIFCONF returns the ipv4 address of the interface, let's check it.
+ ASSERT_EQ(ifr[i].ifr_addr.sa_family, AF_INET);
+
+ // Validate the address is correct for loopback.
+ sockaddr_in* sin = reinterpret_cast<sockaddr_in*>(&ifr[i].ifr_addr);
+ ASSERT_EQ(htonl(sin->sin_addr.s_addr), INADDR_LOOPBACK);
+
+ found_loopback = true;
+ break;
+ }
+ }
+ ASSERT_TRUE(found_loopback);
+}
+
+std::vector<SocketKind> IoctlSocketTypes() {
+ return {SimpleSocket(AF_UNIX, SOCK_STREAM, 0),
+ SimpleSocket(AF_UNIX, SOCK_DGRAM, 0),
+ SimpleSocket(AF_INET, SOCK_STREAM, 0),
+ SimpleSocket(AF_INET6, SOCK_STREAM, 0),
+ SimpleSocket(AF_INET, SOCK_DGRAM, 0),
+ SimpleSocket(AF_INET6, SOCK_DGRAM, 0)};
+}
+
+INSTANTIATE_TEST_SUITE_P(IoctlTest, IoctlTestSIOCGIFCONF,
+ ::testing::ValuesIn(IoctlSocketTypes()));
+
+} // namespace
+
+TEST_F(IoctlTest, FIOGETOWNSucceeds) {
+ const FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ int get = -1;
+ ASSERT_THAT(ioctl(s.get(), FIOGETOWN, &get), SyscallSucceeds());
+ EXPECT_EQ(get, 0);
+}
+
+TEST_F(IoctlTest, SIOCGPGRPSucceeds) {
+ const FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_UNIX, SOCK_SEQPACKET | SOCK_NONBLOCK | SOCK_CLOEXEC, 0));
+
+ int get = -1;
+ ASSERT_THAT(ioctl(s.get(), SIOCGPGRP, &get), SyscallSucceeds());
+ EXPECT_EQ(get, 0);
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/ip_socket_test_util.cc b/test/syscalls/linux/ip_socket_test_util.cc
new file mode 100644
index 000000000..98d07ae85
--- /dev/null
+++ b/test/syscalls/linux/ip_socket_test_util.cc
@@ -0,0 +1,239 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+
+#include <cstring>
+
+namespace gvisor {
+namespace testing {
+
+uint32_t IPFromInetSockaddr(const struct sockaddr* addr) {
+ auto* in_addr = reinterpret_cast<const struct sockaddr_in*>(addr);
+ return in_addr->sin_addr.s_addr;
+}
+
+uint16_t PortFromInetSockaddr(const struct sockaddr* addr) {
+ auto* in_addr = reinterpret_cast<const struct sockaddr_in*>(addr);
+ return ntohs(in_addr->sin_port);
+}
+
+PosixErrorOr<int> InterfaceIndex(std::string name) {
+ int index = if_nametoindex(name.c_str());
+ if (index) {
+ return index;
+ }
+ return PosixError(errno);
+}
+
+namespace {
+
+std::string DescribeSocketType(int type) {
+ return absl::StrCat(((type & SOCK_NONBLOCK) != 0) ? "non-blocking " : "",
+ ((type & SOCK_CLOEXEC) != 0) ? "close-on-exec " : "");
+}
+
+} // namespace
+
+SocketPairKind IPv6TCPAcceptBindSocketPair(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "connected IPv6 TCP socket");
+ return SocketPairKind{
+ description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+ TCPAcceptBindSocketPairCreator(AF_INET6, type | SOCK_STREAM, 0,
+ /* dual_stack = */ false)};
+}
+
+SocketPairKind IPv4TCPAcceptBindSocketPair(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "connected IPv4 TCP socket");
+ return SocketPairKind{
+ description, AF_INET, type | SOCK_STREAM, IPPROTO_TCP,
+ TCPAcceptBindSocketPairCreator(AF_INET, type | SOCK_STREAM, 0,
+ /* dual_stack = */ false)};
+}
+
+SocketPairKind DualStackTCPAcceptBindSocketPair(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "connected dual stack TCP socket");
+ return SocketPairKind{
+ description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+ TCPAcceptBindSocketPairCreator(AF_INET6, type | SOCK_STREAM, 0,
+ /* dual_stack = */ true)};
+}
+
+SocketPairKind IPv6TCPAcceptBindPersistentListenerSocketPair(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "connected IPv6 TCP socket");
+ return SocketPairKind{description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+ TCPAcceptBindPersistentListenerSocketPairCreator(
+ AF_INET6, type | SOCK_STREAM, 0,
+ /* dual_stack = */ false)};
+}
+
+SocketPairKind IPv4TCPAcceptBindPersistentListenerSocketPair(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "connected IPv4 TCP socket");
+ return SocketPairKind{description, AF_INET, type | SOCK_STREAM, IPPROTO_TCP,
+ TCPAcceptBindPersistentListenerSocketPairCreator(
+ AF_INET, type | SOCK_STREAM, 0,
+ /* dual_stack = */ false)};
+}
+
+SocketPairKind DualStackTCPAcceptBindPersistentListenerSocketPair(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "connected dual stack TCP socket");
+ return SocketPairKind{description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+ TCPAcceptBindPersistentListenerSocketPairCreator(
+ AF_INET6, type | SOCK_STREAM, 0,
+ /* dual_stack = */ true)};
+}
+
+SocketPairKind IPv6UDPBidirectionalBindSocketPair(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "connected IPv6 UDP socket");
+ return SocketPairKind{
+ description, AF_INET6, type | SOCK_DGRAM, IPPROTO_UDP,
+ UDPBidirectionalBindSocketPairCreator(AF_INET6, type | SOCK_DGRAM, 0,
+ /* dual_stack = */ false)};
+}
+
+SocketPairKind IPv4UDPBidirectionalBindSocketPair(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "connected IPv4 UDP socket");
+ return SocketPairKind{
+ description, AF_INET, type | SOCK_DGRAM, IPPROTO_UDP,
+ UDPBidirectionalBindSocketPairCreator(AF_INET, type | SOCK_DGRAM, 0,
+ /* dual_stack = */ false)};
+}
+
+SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "connected dual stack UDP socket");
+ return SocketPairKind{
+ description, AF_INET6, type | SOCK_DGRAM, IPPROTO_UDP,
+ UDPBidirectionalBindSocketPairCreator(AF_INET6, type | SOCK_DGRAM, 0,
+ /* dual_stack = */ true)};
+}
+
+SocketPairKind IPv4UDPUnboundSocketPair(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "IPv4 UDP socket");
+ return SocketPairKind{
+ description, AF_INET, type | SOCK_DGRAM, IPPROTO_UDP,
+ UDPUnboundSocketPairCreator(AF_INET, type | SOCK_DGRAM, 0,
+ /* dual_stack = */ false)};
+}
+
+SocketKind IPv4UDPUnboundSocket(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "IPv4 UDP socket");
+ return SocketKind{
+ description, AF_INET, type | SOCK_DGRAM, IPPROTO_UDP,
+ UnboundSocketCreator(AF_INET, type | SOCK_DGRAM, IPPROTO_UDP)};
+}
+
+SocketKind IPv6UDPUnboundSocket(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "IPv6 UDP socket");
+ return SocketKind{
+ description, AF_INET6, type | SOCK_DGRAM, IPPROTO_UDP,
+ UnboundSocketCreator(AF_INET6, type | SOCK_DGRAM, IPPROTO_UDP)};
+}
+
+SocketKind IPv4TCPUnboundSocket(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "IPv4 TCP socket");
+ return SocketKind{
+ description, AF_INET, type | SOCK_STREAM, IPPROTO_TCP,
+ UnboundSocketCreator(AF_INET, type | SOCK_STREAM, IPPROTO_TCP)};
+}
+
+SocketKind IPv6TCPUnboundSocket(int type) {
+ std::string description =
+ absl::StrCat(DescribeSocketType(type), "IPv6 TCP socket");
+ return SocketKind{
+ description, AF_INET6, type | SOCK_STREAM, IPPROTO_TCP,
+ UnboundSocketCreator(AF_INET6, type | SOCK_STREAM, IPPROTO_TCP)};
+}
+
+PosixError IfAddrHelper::Load() {
+ Release();
+ RETURN_ERROR_IF_SYSCALL_FAIL(getifaddrs(&ifaddr_));
+ return NoError();
+}
+
+void IfAddrHelper::Release() {
+ if (ifaddr_) {
+ freeifaddrs(ifaddr_);
+ ifaddr_ = nullptr;
+ }
+}
+
+std::vector<std::string> IfAddrHelper::InterfaceList(int family) const {
+ std::vector<std::string> names;
+ for (auto ifa = ifaddr_; ifa != NULL; ifa = ifa->ifa_next) {
+ if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != family) {
+ continue;
+ }
+ names.emplace(names.end(), ifa->ifa_name);
+ }
+ return names;
+}
+
+const sockaddr* IfAddrHelper::GetAddr(int family, std::string name) const {
+ for (auto ifa = ifaddr_; ifa != NULL; ifa = ifa->ifa_next) {
+ if (ifa->ifa_addr == NULL || ifa->ifa_addr->sa_family != family) {
+ continue;
+ }
+ if (name == ifa->ifa_name) {
+ return ifa->ifa_addr;
+ }
+ }
+ return nullptr;
+}
+
+PosixErrorOr<int> IfAddrHelper::GetIndex(std::string name) const {
+ return InterfaceIndex(name);
+}
+
+std::string GetAddr4Str(const in_addr* a) {
+ char str[INET_ADDRSTRLEN];
+ inet_ntop(AF_INET, a, str, sizeof(str));
+ return std::string(str);
+}
+
+std::string GetAddr6Str(const in6_addr* a) {
+ char str[INET6_ADDRSTRLEN];
+ inet_ntop(AF_INET6, a, str, sizeof(str));
+ return std::string(str);
+}
+
+std::string GetAddrStr(const sockaddr* a) {
+ if (a->sa_family == AF_INET) {
+ auto src = &(reinterpret_cast<const sockaddr_in*>(a)->sin_addr);
+ return GetAddr4Str(src);
+ } else if (a->sa_family == AF_INET6) {
+ auto src = &(reinterpret_cast<const sockaddr_in6*>(a)->sin6_addr);
+ return GetAddr6Str(src);
+ }
+ return std::string("<invalid>");
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/ip_socket_test_util.h b/test/syscalls/linux/ip_socket_test_util.h
new file mode 100644
index 000000000..9c3859fcd
--- /dev/null
+++ b/test/syscalls/linux/ip_socket_test_util.h
@@ -0,0 +1,135 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_IP_SOCKET_TEST_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_IP_SOCKET_TEST_UTIL_H_
+
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <sys/types.h>
+
+#include <string>
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Extracts the IP address from an inet sockaddr in network byte order.
+uint32_t IPFromInetSockaddr(const struct sockaddr* addr);
+
+// Extracts the port from an inet sockaddr in host byte order.
+uint16_t PortFromInetSockaddr(const struct sockaddr* addr);
+
+// InterfaceIndex returns the index of the named interface.
+PosixErrorOr<int> InterfaceIndex(std::string name);
+
+// IPv6TCPAcceptBindSocketPair returns a SocketPairKind that represents
+// SocketPairs created with bind() and accept() syscalls with AF_INET6 and the
+// given type bound to the IPv6 loopback.
+SocketPairKind IPv6TCPAcceptBindSocketPair(int type);
+
+// IPv4TCPAcceptBindSocketPair returns a SocketPairKind that represents
+// SocketPairs created with bind() and accept() syscalls with AF_INET and the
+// given type bound to the IPv4 loopback.
+SocketPairKind IPv4TCPAcceptBindSocketPair(int type);
+
+// DualStackTCPAcceptBindSocketPair returns a SocketPairKind that represents
+// SocketPairs created with bind() and accept() syscalls with AF_INET6 and the
+// given type bound to the IPv4 loopback.
+SocketPairKind DualStackTCPAcceptBindSocketPair(int type);
+
+// IPv6TCPAcceptBindPersistentListenerSocketPair is like
+// IPv6TCPAcceptBindSocketPair except it uses a persistent listening socket to
+// create all socket pairs.
+SocketPairKind IPv6TCPAcceptBindPersistentListenerSocketPair(int type);
+
+// IPv4TCPAcceptBindPersistentListenerSocketPair is like
+// IPv4TCPAcceptBindSocketPair except it uses a persistent listening socket to
+// create all socket pairs.
+SocketPairKind IPv4TCPAcceptBindPersistentListenerSocketPair(int type);
+
+// DualStackTCPAcceptBindPersistentListenerSocketPair is like
+// DualStackTCPAcceptBindSocketPair except it uses a persistent listening socket
+// to create all socket pairs.
+SocketPairKind DualStackTCPAcceptBindPersistentListenerSocketPair(int type);
+
+// IPv6UDPBidirectionalBindSocketPair returns a SocketPairKind that represents
+// SocketPairs created with bind() and connect() syscalls with AF_INET6 and the
+// given type bound to the IPv6 loopback.
+SocketPairKind IPv6UDPBidirectionalBindSocketPair(int type);
+
+// IPv4UDPBidirectionalBindSocketPair returns a SocketPairKind that represents
+// SocketPairs created with bind() and connect() syscalls with AF_INET and the
+// given type bound to the IPv4 loopback.
+SocketPairKind IPv4UDPBidirectionalBindSocketPair(int type);
+
+// DualStackUDPBidirectionalBindSocketPair returns a SocketPairKind that
+// represents SocketPairs created with bind() and connect() syscalls with
+// AF_INET6 and the given type bound to the IPv4 loopback.
+SocketPairKind DualStackUDPBidirectionalBindSocketPair(int type);
+
+// IPv4UDPUnboundSocketPair returns a SocketPairKind that represents
+// SocketPairs created with AF_INET and the given type.
+SocketPairKind IPv4UDPUnboundSocketPair(int type);
+
+// IPv4UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_DGRAM, and the given type.
+SocketKind IPv4UDPUnboundSocket(int type);
+
+// IPv6UDPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_DGRAM, and the given type.
+SocketKind IPv6UDPUnboundSocket(int type);
+
+// IPv4TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET, SOCK_STREAM and the given type.
+SocketKind IPv4TCPUnboundSocket(int type);
+
+// IPv6TCPUnboundSocket returns a SocketKind that represents a SimpleSocket
+// created with AF_INET6, SOCK_STREAM and the given type.
+SocketKind IPv6TCPUnboundSocket(int type);
+
+// IfAddrHelper is a helper class that determines the local interfaces present
+// and provides functions to obtain their names, index numbers, and IP address.
+class IfAddrHelper {
+ public:
+ IfAddrHelper() : ifaddr_(nullptr) {}
+ ~IfAddrHelper() { Release(); }
+
+ PosixError Load();
+ void Release();
+
+ std::vector<std::string> InterfaceList(int family) const;
+
+ const sockaddr* GetAddr(int family, std::string name) const;
+ PosixErrorOr<int> GetIndex(std::string name) const;
+
+ private:
+ struct ifaddrs* ifaddr_;
+};
+
+// GetAddr4Str returns the given IPv4 network address structure as a string.
+std::string GetAddr4Str(const in_addr* a);
+
+// GetAddr6Str returns the given IPv6 network address structure as a string.
+std::string GetAddr6Str(const in6_addr* a);
+
+// GetAddrStr returns the given IPv4 or IPv6 network address structure as a
+// string.
+std::string GetAddrStr(const sockaddr* a);
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_IP_SOCKET_TEST_UTIL_H_
diff --git a/test/syscalls/linux/iptables.cc b/test/syscalls/linux/iptables.cc
new file mode 100644
index 000000000..b8e4ece64
--- /dev/null
+++ b/test/syscalls/linux/iptables.cc
@@ -0,0 +1,204 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/iptables.h"
+
+#include <arpa/inet.h>
+#include <linux/capability.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <stdio.h>
+#include <sys/poll.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <algorithm>
+
+#include "gtest/gtest.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr char kNatTablename[] = "nat";
+constexpr char kErrorTarget[] = "ERROR";
+constexpr size_t kEmptyStandardEntrySize =
+ sizeof(struct ipt_entry) + sizeof(struct ipt_standard_target);
+constexpr size_t kEmptyErrorEntrySize =
+ sizeof(struct ipt_entry) + sizeof(struct ipt_error_target);
+
+TEST(IPTablesBasic, CreateSocket) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ int sock;
+ ASSERT_THAT(sock = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP),
+ SyscallSucceeds());
+
+ ASSERT_THAT(close(sock), SyscallSucceeds());
+}
+
+TEST(IPTablesBasic, FailSockoptNonRaw) {
+ // Even if the user has CAP_NET_RAW, they shouldn't be able to use the
+ // iptables sockopts with a non-raw socket.
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ int sock;
+ ASSERT_THAT(sock = socket(AF_INET, SOCK_DGRAM, 0), SyscallSucceeds());
+
+ struct ipt_getinfo info = {};
+ snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+ socklen_t info_size = sizeof(info);
+ EXPECT_THAT(getsockopt(sock, IPPROTO_IP, SO_GET_INFO, &info, &info_size),
+ SyscallFailsWithErrno(ENOPROTOOPT));
+
+ ASSERT_THAT(close(sock), SyscallSucceeds());
+}
+
+// Fixture for iptables tests.
+class IPTablesTest : public ::testing::Test {
+ protected:
+ // Creates a socket to be used in tests.
+ void SetUp() override;
+
+ // Closes the socket created by SetUp().
+ void TearDown() override;
+
+ // The socket via which to manipulate iptables.
+ int s_;
+};
+
+void IPTablesTest::SetUp() {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(s_ = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), SyscallSucceeds());
+}
+
+void IPTablesTest::TearDown() {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ EXPECT_THAT(close(s_), SyscallSucceeds());
+}
+
+// This tests the initial state of a machine with empty iptables. We don't have
+// a guarantee that the iptables are empty when running in native, but we can
+// test that gVisor has the same initial state that a newly-booted Linux machine
+// would have.
+TEST_F(IPTablesTest, InitialState) {
+ SKIP_IF(!IsRunningOnGvisor());
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ //
+ // Get info via sockopt.
+ //
+ struct ipt_getinfo info = {};
+ snprintf(info.name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+ socklen_t info_size = sizeof(info);
+ ASSERT_THAT(getsockopt(s_, IPPROTO_IP, SO_GET_INFO, &info, &info_size),
+ SyscallSucceeds());
+
+ // The nat table supports PREROUTING, and OUTPUT.
+ unsigned int valid_hooks = (1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT) |
+ (1 << NF_IP_POST_ROUTING) | (1 << NF_IP_LOCAL_IN);
+
+ EXPECT_EQ(info.valid_hooks, valid_hooks);
+
+ // Each chain consists of an empty entry with a standard target..
+ EXPECT_EQ(info.hook_entry[NF_IP_PRE_ROUTING], 0);
+ EXPECT_EQ(info.hook_entry[NF_IP_LOCAL_IN], kEmptyStandardEntrySize);
+ EXPECT_EQ(info.hook_entry[NF_IP_LOCAL_OUT], kEmptyStandardEntrySize * 2);
+ EXPECT_EQ(info.hook_entry[NF_IP_POST_ROUTING], kEmptyStandardEntrySize * 3);
+
+ // The underflow points are the same as the entry points.
+ EXPECT_EQ(info.underflow[NF_IP_PRE_ROUTING], 0);
+ EXPECT_EQ(info.underflow[NF_IP_LOCAL_IN], kEmptyStandardEntrySize);
+ EXPECT_EQ(info.underflow[NF_IP_LOCAL_OUT], kEmptyStandardEntrySize * 2);
+ EXPECT_EQ(info.underflow[NF_IP_POST_ROUTING], kEmptyStandardEntrySize * 3);
+
+ // One entry for each chain, plus an error entry at the end.
+ EXPECT_EQ(info.num_entries, 5);
+
+ EXPECT_EQ(info.size, 4 * kEmptyStandardEntrySize + kEmptyErrorEntrySize);
+ EXPECT_EQ(strcmp(info.name, kNatTablename), 0);
+
+ //
+ // Use info to get entries.
+ //
+ socklen_t entries_size = sizeof(struct ipt_get_entries) + info.size;
+ struct ipt_get_entries* entries =
+ static_cast<struct ipt_get_entries*>(malloc(entries_size));
+ snprintf(entries->name, XT_TABLE_MAXNAMELEN, "%s", kNatTablename);
+ entries->size = info.size;
+ ASSERT_THAT(
+ getsockopt(s_, IPPROTO_IP, SO_GET_ENTRIES, entries, &entries_size),
+ SyscallSucceeds());
+
+ // Verify the name and size.
+ ASSERT_EQ(info.size, entries->size);
+ ASSERT_EQ(strcmp(entries->name, kNatTablename), 0);
+
+ // Verify that the entrytable is 4 entries with accept targets and no matches
+ // followed by a single error target.
+ size_t entry_offset = 0;
+ while (entry_offset < entries->size) {
+ struct ipt_entry* entry = reinterpret_cast<struct ipt_entry*>(
+ reinterpret_cast<char*>(entries->entrytable) + entry_offset);
+
+ // ip should be zeroes.
+ struct ipt_ip zeroed = {};
+ EXPECT_EQ(memcmp(static_cast<void*>(&zeroed),
+ static_cast<void*>(&entry->ip), sizeof(zeroed)),
+ 0);
+
+ // target_offset should be zero.
+ EXPECT_EQ(entry->target_offset, sizeof(ipt_entry));
+
+ if (entry_offset < kEmptyStandardEntrySize * 4) {
+ // The first 4 entries are standard targets
+ struct ipt_standard_target* target =
+ reinterpret_cast<struct ipt_standard_target*>(entry->elems);
+ EXPECT_EQ(entry->next_offset, kEmptyStandardEntrySize);
+ EXPECT_EQ(target->target.u.user.target_size, sizeof(*target));
+ EXPECT_EQ(strcmp(target->target.u.user.name, ""), 0);
+ EXPECT_EQ(target->target.u.user.revision, 0);
+ // This is what's returned for an accept verdict. I don't know why.
+ EXPECT_EQ(target->verdict, -NF_ACCEPT - 1);
+ } else {
+ // The last entry is an error target
+ struct ipt_error_target* target =
+ reinterpret_cast<struct ipt_error_target*>(entry->elems);
+ EXPECT_EQ(entry->next_offset, kEmptyErrorEntrySize);
+ EXPECT_EQ(target->target.u.user.target_size, sizeof(*target));
+ EXPECT_EQ(strcmp(target->target.u.user.name, kErrorTarget), 0);
+ EXPECT_EQ(target->target.u.user.revision, 0);
+ EXPECT_EQ(strcmp(target->errorname, kErrorTarget), 0);
+ }
+
+ entry_offset += entry->next_offset;
+ }
+
+ free(entries);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/iptables.h b/test/syscalls/linux/iptables.h
new file mode 100644
index 000000000..0719c60a4
--- /dev/null
+++ b/test/syscalls/linux/iptables.h
@@ -0,0 +1,198 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// There are a number of structs and values that we can't #include because of a
+// difference between C and C++ (C++ won't let you implicitly cast from void* to
+// struct something*). We re-define them here.
+
+#ifndef GVISOR_TEST_SYSCALLS_IPTABLES_TYPES_H_
+#define GVISOR_TEST_SYSCALLS_IPTABLES_TYPES_H_
+
+// Netfilter headers require some headers to preceed them.
+// clang-format off
+#include <netinet/in.h>
+#include <stddef.h>
+// clang-format on
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/if.h>
+#include <netinet/ip.h>
+#include <stdint.h>
+
+#define ipt_standard_target xt_standard_target
+#define ipt_entry_target xt_entry_target
+#define ipt_error_target xt_error_target
+
+enum SockOpts {
+ // For setsockopt.
+ BASE_CTL = 64,
+ SO_SET_REPLACE = BASE_CTL,
+ SO_SET_ADD_COUNTERS,
+ SO_SET_MAX = SO_SET_ADD_COUNTERS,
+
+ // For getsockopt.
+ SO_GET_INFO = BASE_CTL,
+ SO_GET_ENTRIES,
+ SO_GET_REVISION_MATCH,
+ SO_GET_REVISION_TARGET,
+ SO_GET_MAX = SO_GET_REVISION_TARGET
+};
+
+// ipt_ip specifies basic matching criteria that can be applied by examining
+// only the IP header of a packet.
+struct ipt_ip {
+ // Source IP address.
+ struct in_addr src;
+
+ // Destination IP address.
+ struct in_addr dst;
+
+ // Source IP address mask.
+ struct in_addr smsk;
+
+ // Destination IP address mask.
+ struct in_addr dmsk;
+
+ // Input interface.
+ char iniface[IFNAMSIZ];
+
+ // Output interface.
+ char outiface[IFNAMSIZ];
+
+ // Input interface mask.
+ unsigned char iniface_mask[IFNAMSIZ];
+
+ // Output interface mask.
+ unsigned char outiface_mask[IFNAMSIZ];
+
+ // Transport protocol.
+ uint16_t proto;
+
+ // Flags.
+ uint8_t flags;
+
+ // Inverse flags.
+ uint8_t invflags;
+};
+
+// ipt_entry is an iptables rule. It contains information about what packets the
+// rule matches and what action (target) to perform for matching packets.
+struct ipt_entry {
+ // Basic matching information used to match a packet's IP header.
+ struct ipt_ip ip;
+
+ // A caching field that isn't used by userspace.
+ unsigned int nfcache;
+
+ // The number of bytes between the start of this ipt_entry struct and the
+ // rule's target.
+ uint16_t target_offset;
+
+ // The total size of this rule, from the beginning of the entry to the end of
+ // the target.
+ uint16_t next_offset;
+
+ // A return pointer not used by userspace.
+ unsigned int comefrom;
+
+ // Counters for packets and bytes, which we don't yet implement.
+ struct xt_counters counters;
+
+ // The data for all this rules matches followed by the target. This runs
+ // beyond the value of sizeof(struct ipt_entry).
+ unsigned char elems[0];
+};
+
+// Passed to getsockopt(SO_GET_INFO).
+struct ipt_getinfo {
+ // The name of the table. The user only fills this in, the rest is filled in
+ // when returning from getsockopt. Currently "nat" and "mangle" are supported.
+ char name[XT_TABLE_MAXNAMELEN];
+
+ // A bitmap of which hooks apply to the table. For example, a table with hooks
+ // PREROUTING and FORWARD has the value
+ // (1 << NF_IP_PRE_REOUTING) | (1 << NF_IP_FORWARD).
+ unsigned int valid_hooks;
+
+ // The offset into the entry table for each valid hook. The entry table is
+ // returned by getsockopt(SO_GET_ENTRIES).
+ unsigned int hook_entry[NF_IP_NUMHOOKS];
+
+ // For each valid hook, the underflow is the offset into the entry table to
+ // jump to in case traversing the table yields no verdict (although I have no
+ // clue how that could happen - builtin chains always end with a policy, and
+ // user-defined chains always end with a RETURN.
+ //
+ // The entry referred to must be an "unconditional" entry, meaning it has no
+ // matches, specifies no IP criteria, and either DROPs or ACCEPTs packets. It
+ // basically has to be capable of making a definitive decision no matter what
+ // it's passed.
+ unsigned int underflow[NF_IP_NUMHOOKS];
+
+ // The number of entries in the entry table returned by
+ // getsockopt(SO_GET_ENTRIES).
+ unsigned int num_entries;
+
+ // The size of the entry table returned by getsockopt(SO_GET_ENTRIES).
+ unsigned int size;
+};
+
+// Passed to getsockopt(SO_GET_ENTRIES).
+struct ipt_get_entries {
+ // The name of the table. The user fills this in. Currently "nat" and "mangle"
+ // are supported.
+ char name[XT_TABLE_MAXNAMELEN];
+
+ // The size of the entry table in bytes. The user fills this in with the value
+ // from struct ipt_getinfo.size.
+ unsigned int size;
+
+ // The entries for the given table. This will run past the size defined by
+ // sizeof(struct ipt_get_entries).
+ struct ipt_entry entrytable[0];
+};
+
+// Passed to setsockopt(SO_SET_REPLACE).
+struct ipt_replace {
+ // The name of the table.
+ char name[XT_TABLE_MAXNAMELEN];
+
+ // The same as struct ipt_getinfo.valid_hooks. Users don't change this.
+ unsigned int valid_hooks;
+
+ // The same as struct ipt_getinfo.num_entries.
+ unsigned int num_entries;
+
+ // The same as struct ipt_getinfo.size.
+ unsigned int size;
+
+ // The same as struct ipt_getinfo.hook_entry.
+ unsigned int hook_entry[NF_IP_NUMHOOKS];
+
+ // The same as struct ipt_getinfo.underflow.
+ unsigned int underflow[NF_IP_NUMHOOKS];
+
+ // The number of counters, which should equal the number of entries.
+ unsigned int num_counters;
+
+ // The unchanged values from each ipt_entry's counters.
+ struct xt_counters* counters;
+
+ // The entries to write to the table. This will run past the size defined by
+ // sizeof(srtuct ipt_replace);
+ struct ipt_entry entries[0];
+};
+
+#endif // GVISOR_TEST_SYSCALLS_IPTABLES_TYPES_H_
diff --git a/test/syscalls/linux/itimer.cc b/test/syscalls/linux/itimer.cc
new file mode 100644
index 000000000..e397d5f57
--- /dev/null
+++ b/test/syscalls/linux/itimer.cc
@@ -0,0 +1,366 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+
+#include <atomic>
+#include <functional>
+#include <iostream>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/timer_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr char kSIGALRMToMainThread[] = "--itimer_sigarlm_to_main_thread";
+constexpr char kSIGPROFFairnessActive[] = "--itimer_sigprof_fairness_active";
+constexpr char kSIGPROFFairnessIdle[] = "--itimer_sigprof_fairness_idle";
+
+// Time period to be set for the itimers.
+constexpr absl::Duration kPeriod = absl::Milliseconds(25);
+// Total amount of time to spend per thread.
+constexpr absl::Duration kTestDuration = absl::Seconds(20);
+// Amount of spin iterations to perform as the minimum work item per thread.
+// Chosen to be sub-millisecond range.
+constexpr int kIterations = 10000000;
+// Allow deviation in the number of samples.
+constexpr double kNumSamplesDeviationRatio = 0.2;
+
+TEST(ItimerTest, ItimervalUpdatedBeforeExpiration) {
+ constexpr int kSleepSecs = 10;
+ constexpr int kAlarmSecs = 15;
+ static_assert(
+ kSleepSecs < kAlarmSecs,
+ "kSleepSecs must be less than kAlarmSecs for the test to be meaningful");
+ constexpr int kMaxRemainingSecs = kAlarmSecs - kSleepSecs;
+
+ // Install a no-op handler for SIGALRM.
+ struct sigaction sa = {};
+ sigfillset(&sa.sa_mask);
+ sa.sa_handler = +[](int signo) {};
+ auto const cleanup_sa =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa));
+
+ // Set an itimer-based alarm for kAlarmSecs from now.
+ struct itimerval itv = {};
+ itv.it_value.tv_sec = kAlarmSecs;
+ auto const cleanup_itimer =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_REAL, itv));
+
+ // After sleeping for kSleepSecs, the itimer value should reflect the elapsed
+ // time even if it hasn't expired.
+ absl::SleepFor(absl::Seconds(kSleepSecs));
+ ASSERT_THAT(getitimer(ITIMER_REAL, &itv), SyscallSucceeds());
+ EXPECT_TRUE(
+ itv.it_value.tv_sec < kMaxRemainingSecs ||
+ (itv.it_value.tv_sec == kMaxRemainingSecs && itv.it_value.tv_usec == 0))
+ << "Remaining time: " << itv.it_value.tv_sec << " seconds + "
+ << itv.it_value.tv_usec << " microseconds";
+}
+
+ABSL_CONST_INIT static thread_local std::atomic_int signal_test_num_samples =
+ ATOMIC_VAR_INIT(0);
+
+void SignalTestSignalHandler(int /*signum*/) { signal_test_num_samples++; }
+
+struct SignalTestResult {
+ int expected_total;
+ int main_thread_samples;
+ std::vector<int> worker_samples;
+};
+
+std::ostream& operator<<(std::ostream& os, const SignalTestResult& r) {
+ os << "{expected_total: " << r.expected_total
+ << ", main_thread_samples: " << r.main_thread_samples
+ << ", worker_samples: [";
+ bool first = true;
+ for (int sample : r.worker_samples) {
+ if (!first) {
+ os << ", ";
+ }
+ os << sample;
+ first = false;
+ }
+ os << "]}";
+ return os;
+}
+
+// Starts two worker threads and itimer id and measures the number of signal
+// delivered to each thread.
+SignalTestResult ItimerSignalTest(int id, clock_t main_clock,
+ clock_t worker_clock, int signal,
+ absl::Duration sleep) {
+ signal_test_num_samples = 0;
+
+ struct sigaction sa = {};
+ sa.sa_handler = &SignalTestSignalHandler;
+ sa.sa_flags = SA_RESTART;
+ sigemptyset(&sa.sa_mask);
+ auto sigaction_cleanup = ScopedSigaction(signal, sa).ValueOrDie();
+
+ int socketfds[2];
+ TEST_PCHECK(socketpair(AF_UNIX, SOCK_STREAM, 0, socketfds) == 0);
+
+ // Do the spinning in the workers.
+ std::function<void*(int)> work = [&](int socket_fd) {
+ FileDescriptor fd(socket_fd);
+
+ absl::Time finish = Now(worker_clock) + kTestDuration;
+ while (Now(worker_clock) < finish) {
+ // Blocked on read.
+ char c;
+ RetryEINTR(read)(fd.get(), &c, 1);
+ for (int i = 0; i < kIterations; i++) {
+ // Ensure compiler won't optimize this loop away.
+ asm("");
+ }
+
+ if (sleep != absl::ZeroDuration()) {
+ // Sleep so that the entire process is idle for a while.
+ absl::SleepFor(sleep);
+ }
+
+ // Unblock the other thread.
+ RetryEINTR(write)(fd.get(), &c, 1);
+ }
+
+ return reinterpret_cast<void*>(signal_test_num_samples.load());
+ };
+
+ ScopedThread th1(
+ static_cast<std::function<void*()>>(std::bind(work, socketfds[0])));
+ ScopedThread th2(
+ static_cast<std::function<void*()>>(std::bind(work, socketfds[1])));
+
+ absl::Time start = Now(main_clock);
+ // Start the timer.
+ struct itimerval timer = {};
+ timer.it_value = absl::ToTimeval(kPeriod);
+ timer.it_interval = absl::ToTimeval(kPeriod);
+ auto cleanup_itimer = ScopedItimer(id, timer).ValueOrDie();
+
+ // Unblock th1.
+ //
+ // N.B. th2 owns socketfds[1] but can't close it until it unblocks.
+ char c = 0;
+ TEST_CHECK(write(socketfds[1], &c, 1) == 1);
+
+ SignalTestResult result;
+
+ // Wait for the workers to be done and collect their sample counts.
+ result.worker_samples.push_back(reinterpret_cast<int64_t>(th1.Join()));
+ result.worker_samples.push_back(reinterpret_cast<int64_t>(th2.Join()));
+ cleanup_itimer.Release()();
+ result.expected_total = (Now(main_clock) - start) / kPeriod;
+ result.main_thread_samples = signal_test_num_samples.load();
+
+ return result;
+}
+
+int TestSIGALRMToMainThread() {
+ SignalTestResult result =
+ ItimerSignalTest(ITIMER_REAL, CLOCK_REALTIME, CLOCK_REALTIME, SIGALRM,
+ absl::ZeroDuration());
+
+ std::cerr << "result: " << result << std::endl;
+
+ // ITIMER_REAL-generated SIGALRMs prefer to deliver to the thread group leader
+ // (but don't guarantee it), so we expect to see most samples on the main
+ // thread.
+ //
+ // The number of SIGALRMs delivered to a worker should not exceed 20%
+ // of the number of total signals expected (this is somewhat arbitrary).
+ const int worker_threshold = result.expected_total / 5;
+
+ //
+ // Linux only guarantees timers will never expire before the requested time.
+ // Thus, we only check the upper bound and also it at least have one sample.
+ TEST_CHECK(result.main_thread_samples <= result.expected_total);
+ TEST_CHECK(result.main_thread_samples > 0);
+ for (int num : result.worker_samples) {
+ TEST_CHECK_MSG(num <= worker_threshold, "worker received too many samples");
+ }
+
+ return 0;
+}
+
+// Random save/restore is disabled as it introduces additional latency and
+// unpredictable distribution patterns.
+TEST(ItimerTest, DeliversSIGALRMToMainThread_NoRandomSave) {
+ pid_t child;
+ int execve_errno;
+ auto kill = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec("/proc/self/exe", {"/proc/self/exe", kSIGALRMToMainThread},
+ {}, &child, &execve_errno));
+ EXPECT_EQ(0, execve_errno);
+
+ int status;
+ EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0),
+ SyscallSucceedsWithValue(child));
+
+ // Not required anymore.
+ kill.Release();
+
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) << status;
+}
+
+// Signals are delivered to threads fairly.
+//
+// sleep indicates how long to sleep worker threads each iteration to make the
+// entire process idle.
+int TestSIGPROFFairness(absl::Duration sleep) {
+ SignalTestResult result =
+ ItimerSignalTest(ITIMER_PROF, CLOCK_PROCESS_CPUTIME_ID,
+ CLOCK_THREAD_CPUTIME_ID, SIGPROF, sleep);
+
+ std::cerr << "result: " << result << std::endl;
+
+ // The number of samples on the main thread should be very low as it did
+ // nothing.
+ TEST_CHECK(result.main_thread_samples < 80);
+
+ // Both workers should get roughly equal number of samples.
+ TEST_CHECK(result.worker_samples.size() == 2);
+
+ TEST_CHECK(result.expected_total > 0);
+
+ // In an ideal world each thread would get exactly 50% of the signals,
+ // but since that's unlikely to happen we allow for them to get no less than
+ // kNumSamplesDeviationRatio of the total observed samples.
+ TEST_CHECK_MSG(std::abs(result.worker_samples[0] - result.worker_samples[1]) <
+ ((result.worker_samples[0] + result.worker_samples[1]) *
+ kNumSamplesDeviationRatio),
+ "one worker received disproportionate share of samples");
+
+ return 0;
+}
+
+// Random save/restore is disabled as it introduces additional latency and
+// unpredictable distribution patterns.
+TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyActive_NoRandomSave) {
+ // On the KVM and ptrace platforms, switches between sentry and application
+ // context are sometimes extremely slow, causing the itimer to send SIGPROF to
+ // a thread that either already has one pending or has had SIGPROF delivered,
+ // but hasn't handled it yet (and thus therefore still has SIGPROF masked). In
+ // either case, since itimer signals are group-directed, signal sending falls
+ // back to notifying the thread group leader. ItimerSignalTest() fails if "too
+ // many" signals are delivered to the thread group leader, so these tests are
+ // flaky on these platforms.
+ //
+ // TODO(b/143247272): Clarify why context switches are so slow on KVM.
+ const auto gvisor_platform = GvisorPlatform();
+ SKIP_IF(gvisor_platform == Platform::kKVM ||
+ gvisor_platform == Platform::kPtrace);
+
+ pid_t child;
+ int execve_errno;
+ auto kill = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec("/proc/self/exe", {"/proc/self/exe", kSIGPROFFairnessActive},
+ {}, &child, &execve_errno));
+ EXPECT_EQ(0, execve_errno);
+
+ int status;
+ EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0),
+ SyscallSucceedsWithValue(child));
+
+ // Not required anymore.
+ kill.Release();
+
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+// Random save/restore is disabled as it introduces additional latency and
+// unpredictable distribution patterns.
+TEST(ItimerTest, DeliversSIGPROFToThreadsRoughlyFairlyIdle_NoRandomSave) {
+ // See comment in DeliversSIGPROFToThreadsRoughlyFairlyActive.
+ const auto gvisor_platform = GvisorPlatform();
+ SKIP_IF(gvisor_platform == Platform::kKVM ||
+ gvisor_platform == Platform::kPtrace);
+
+ pid_t child;
+ int execve_errno;
+ auto kill = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec("/proc/self/exe", {"/proc/self/exe", kSIGPROFFairnessIdle},
+ {}, &child, &execve_errno));
+ EXPECT_EQ(0, execve_errno);
+
+ int status;
+ EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0),
+ SyscallSucceedsWithValue(child));
+
+ // Not required anymore.
+ kill.Release();
+
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "Exited with code: " << status;
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
+
+namespace {
+void MaskSIGPIPE() {
+ // Always mask SIGPIPE as it's common and tests aren't expected to handle it.
+ // We don't take the TestInit() path so we must do this manually.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_IGN;
+ TEST_CHECK(sigaction(SIGPIPE, &sa, nullptr) == 0);
+}
+} // namespace
+
+int main(int argc, char** argv) {
+ // These tests require no background threads, so check for them before
+ // TestInit.
+ for (int i = 0; i < argc; i++) {
+ absl::string_view arg(argv[i]);
+
+ if (arg == gvisor::testing::kSIGALRMToMainThread) {
+ MaskSIGPIPE();
+ return gvisor::testing::TestSIGALRMToMainThread();
+ }
+ if (arg == gvisor::testing::kSIGPROFFairnessActive) {
+ MaskSIGPIPE();
+ return gvisor::testing::TestSIGPROFFairness(absl::ZeroDuration());
+ }
+ if (arg == gvisor::testing::kSIGPROFFairnessIdle) {
+ MaskSIGPIPE();
+ // Sleep time > ClockTick (10ms) exercises sleeping gVisor's
+ // kernel.cpuClockTicker.
+ return gvisor::testing::TestSIGPROFFairness(absl::Milliseconds(25));
+ }
+ }
+
+ gvisor::testing::TestInit(&argc, &argv);
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/kill.cc b/test/syscalls/linux/kill.cc
new file mode 100644
index 000000000..db29bd59c
--- /dev/null
+++ b/test/syscalls/linux/kill.cc
@@ -0,0 +1,383 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <cerrno>
+#include <csignal>
+
+#include "gtest/gtest.h"
+#include "absl/flags/flag.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+ABSL_FLAG(int32_t, scratch_uid, 65534, "scratch UID");
+ABSL_FLAG(int32_t, scratch_gid, 65534, "scratch GID");
+
+using ::testing::Ge;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(KillTest, CanKillValidPid) {
+ // If pid is positive, then signal sig is sent to the process with the ID
+ // specified by pid.
+ EXPECT_THAT(kill(getpid(), 0), SyscallSucceeds());
+ // If pid equals 0, then sig is sent to every process in the process group of
+ // the calling process.
+ EXPECT_THAT(kill(0, 0), SyscallSucceeds());
+
+ ScopedThread([] { EXPECT_THAT(kill(gettid(), 0), SyscallSucceeds()); });
+}
+
+void SigHandler(int sig, siginfo_t* info, void* context) { _exit(0); }
+
+// If pid equals -1, then sig is sent to every process for which the calling
+// process has permission to send signals, except for process 1 (init).
+TEST(KillTest, CanKillAllPIDs) {
+ int pipe_fds[2];
+ ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds());
+ FileDescriptor read_fd(pipe_fds[0]);
+ FileDescriptor write_fd(pipe_fds[1]);
+
+ pid_t pid = fork();
+ if (pid == 0) {
+ read_fd.reset();
+
+ struct sigaction sa;
+ sa.sa_sigaction = SigHandler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ TEST_PCHECK(sigaction(SIGWINCH, &sa, nullptr) == 0);
+ MaybeSave();
+
+ // Indicate to the parent that we're ready.
+ write_fd.reset();
+
+ // Wait until we get the signal from the parent.
+ while (true) {
+ pause();
+ }
+ }
+
+ ASSERT_THAT(pid, SyscallSucceeds());
+
+ write_fd.reset();
+
+ // Wait for the child to indicate that it's unmasked the signal by closing
+ // the write end.
+ char buf;
+ ASSERT_THAT(ReadFd(read_fd.get(), &buf, 1), SyscallSucceedsWithValue(0));
+
+ // Signal the child and wait for it to die with status 0, indicating that
+ // it got the expected signal.
+ EXPECT_THAT(kill(-1, SIGWINCH), SyscallSucceeds());
+
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(pid, &status, 0),
+ SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(KillTest, CannotKillInvalidPID) {
+ // We need an unused pid to verify that kill fails when given one.
+ //
+ // There is no way to guarantee that a PID is unused, but the PID of a
+ // recently exited process likely won't be reused soon.
+ pid_t fake_pid = fork();
+ if (fake_pid == 0) {
+ _exit(0);
+ }
+
+ ASSERT_THAT(fake_pid, SyscallSucceeds());
+
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(fake_pid, &status, 0),
+ SyscallSucceedsWithValue(fake_pid));
+ EXPECT_TRUE(WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+
+ EXPECT_THAT(kill(fake_pid, 0), SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(KillTest, CannotUseInvalidSignal) {
+ EXPECT_THAT(kill(getpid(), 200), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(KillTest, CanKillRemoteProcess) {
+ pid_t pid = fork();
+ if (pid == 0) {
+ while (true) {
+ pause();
+ }
+ }
+
+ ASSERT_THAT(pid, SyscallSucceeds());
+
+ EXPECT_THAT(kill(pid, SIGKILL), SyscallSucceeds());
+
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(pid, &status, 0),
+ SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFSIGNALED(status));
+ EXPECT_EQ(SIGKILL, WTERMSIG(status));
+}
+
+TEST(KillTest, CanKillOwnProcess) {
+ EXPECT_THAT(kill(getpid(), 0), SyscallSucceeds());
+}
+
+// Verify that you can kill a process even using a tid from a thread other than
+// the group leader.
+TEST(KillTest, CannotKillTid) {
+ pid_t tid;
+ bool tid_available = false;
+ bool finished = false;
+ absl::Mutex mu;
+ ScopedThread t([&] {
+ mu.Lock();
+ tid = gettid();
+ tid_available = true;
+ mu.Await(absl::Condition(&finished));
+ mu.Unlock();
+ });
+ mu.LockWhen(absl::Condition(&tid_available));
+ EXPECT_THAT(kill(tid, 0), SyscallSucceeds());
+ finished = true;
+ mu.Unlock();
+}
+
+TEST(KillTest, SetPgid) {
+ for (int i = 0; i < 10; i++) {
+ // The following in the normal pattern for creating a new process group.
+ // Both the parent and child process will call setpgid in order to avoid any
+ // race conditions. We do this ten times to catch races.
+ pid_t pid = fork();
+ if (pid == 0) {
+ setpgid(0, 0);
+ while (true) {
+ pause();
+ }
+ }
+
+ ASSERT_THAT(pid, SyscallSucceeds());
+
+ // Set the child's group and exit.
+ ASSERT_THAT(setpgid(pid, pid), SyscallSucceeds());
+ EXPECT_THAT(kill(pid, SIGKILL), SyscallSucceeds());
+
+ int status;
+ EXPECT_THAT(RetryEINTR(waitpid)(-pid, &status, 0),
+ SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFSIGNALED(status));
+ EXPECT_EQ(SIGKILL, WTERMSIG(status));
+ }
+}
+
+TEST(KillTest, ProcessGroups) {
+ // Fork a new child.
+ //
+ // other_child is used as a placeholder process. We use this PID as our "does
+ // not exist" process group to ensure some amount of safety. (It is still
+ // possible to violate this assumption, but extremely unlikely.)
+ pid_t child = fork();
+ if (child == 0) {
+ while (true) {
+ pause();
+ }
+ }
+ ASSERT_THAT(child, SyscallSucceeds());
+
+ pid_t other_child = fork();
+ if (other_child == 0) {
+ while (true) {
+ pause();
+ }
+ }
+ ASSERT_THAT(other_child, SyscallSucceeds());
+
+ // Ensure the kill does not succeed without the new group.
+ EXPECT_THAT(kill(-child, SIGKILL), SyscallFailsWithErrno(ESRCH));
+
+ // Put the child in its own process group.
+ ASSERT_THAT(setpgid(child, child), SyscallSucceeds());
+
+ // This should be not allowed: you can only create a new group with the same
+ // id or join an existing one. The other_child group should not exist.
+ ASSERT_THAT(setpgid(child, other_child), SyscallFailsWithErrno(EPERM));
+
+ // Done with other_child; kill it.
+ EXPECT_THAT(kill(other_child, SIGKILL), SyscallSucceeds());
+ int status;
+ EXPECT_THAT(RetryEINTR(waitpid)(other_child, &status, 0), SyscallSucceeds());
+
+ // Linux returns success for the no-op call.
+ ASSERT_THAT(setpgid(child, child), SyscallSucceeds());
+
+ // Kill the child's process group.
+ ASSERT_THAT(kill(-child, SIGKILL), SyscallSucceeds());
+
+ // Wait on the process group; ensure that the signal was as expected.
+ EXPECT_THAT(RetryEINTR(waitpid)(-child, &status, 0),
+ SyscallSucceedsWithValue(child));
+ EXPECT_TRUE(WIFSIGNALED(status));
+ EXPECT_EQ(SIGKILL, WTERMSIG(status));
+
+ // Try to kill the process group again; ensure that the wait fails.
+ EXPECT_THAT(kill(-child, SIGKILL), SyscallFailsWithErrno(ESRCH));
+ EXPECT_THAT(RetryEINTR(waitpid)(-child, &status, 0),
+ SyscallFailsWithErrno(ECHILD));
+}
+
+TEST(KillTest, ChildDropsPrivsCannotKill) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID)));
+
+ const int uid = absl::GetFlag(FLAGS_scratch_uid);
+ const int gid = absl::GetFlag(FLAGS_scratch_gid);
+
+ // Create the child that drops privileges and tries to kill the parent.
+ pid_t pid = fork();
+ if (pid == 0) {
+ TEST_PCHECK(setresgid(gid, gid, gid) == 0);
+ MaybeSave();
+
+ TEST_PCHECK(setresuid(uid, uid, uid) == 0);
+ MaybeSave();
+
+ // setresuid should have dropped CAP_KILL. Make sure.
+ TEST_CHECK(!HaveCapability(CAP_KILL).ValueOrDie());
+
+ // Try to kill parent with every signal-sending syscall possible.
+ pid_t parent = getppid();
+
+ TEST_CHECK(kill(parent, SIGKILL) < 0);
+ TEST_PCHECK_MSG(errno == EPERM, "kill failed with wrong errno");
+ MaybeSave();
+
+ TEST_CHECK(tgkill(parent, parent, SIGKILL) < 0);
+ TEST_PCHECK_MSG(errno == EPERM, "tgkill failed with wrong errno");
+ MaybeSave();
+
+ TEST_CHECK(syscall(SYS_tkill, parent, SIGKILL) < 0);
+ TEST_PCHECK_MSG(errno == EPERM, "tkill failed with wrong errno");
+ MaybeSave();
+
+ siginfo_t uinfo;
+ uinfo.si_code = -1; // SI_QUEUE (allowed).
+
+ TEST_CHECK(syscall(SYS_rt_sigqueueinfo, parent, SIGKILL, &uinfo) < 0);
+ TEST_PCHECK_MSG(errno == EPERM, "rt_sigqueueinfo failed with wrong errno");
+ MaybeSave();
+
+ TEST_CHECK(syscall(SYS_rt_tgsigqueueinfo, parent, parent, SIGKILL, &uinfo) <
+ 0);
+ TEST_PCHECK_MSG(errno == EPERM, "rt_sigqueueinfo failed with wrong errno");
+ MaybeSave();
+
+ _exit(0);
+ }
+
+ ASSERT_THAT(pid, SyscallSucceeds());
+
+ int status;
+ EXPECT_THAT(RetryEINTR(waitpid)(pid, &status, 0),
+ SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status = " << status;
+}
+
+TEST(KillTest, CanSIGCONTSameSession) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID)));
+
+ pid_t stopped_child = fork();
+ if (stopped_child == 0) {
+ raise(SIGSTOP);
+ _exit(0);
+ }
+
+ ASSERT_THAT(stopped_child, SyscallSucceeds());
+
+ // Put the child in its own process group. The child and parent process
+ // groups also share a session.
+ ASSERT_THAT(setpgid(stopped_child, stopped_child), SyscallSucceeds());
+
+ // Make sure child stopped.
+ int status;
+ EXPECT_THAT(RetryEINTR(waitpid)(stopped_child, &status, WUNTRACED),
+ SyscallSucceedsWithValue(stopped_child));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << "status " << status;
+
+ const int uid = absl::GetFlag(FLAGS_scratch_uid);
+ const int gid = absl::GetFlag(FLAGS_scratch_gid);
+
+ // Drop privileges only in child process, or else this parent process won't be
+ // able to open some log files after the test ends.
+ pid_t other_child = fork();
+ if (other_child == 0) {
+ // Drop privileges.
+ TEST_PCHECK(setresgid(gid, gid, gid) == 0);
+ MaybeSave();
+
+ TEST_PCHECK(setresuid(uid, uid, uid) == 0);
+ MaybeSave();
+
+ // setresuid should have dropped CAP_KILL.
+ TEST_CHECK(!HaveCapability(CAP_KILL).ValueOrDie());
+
+ // Child 2 and child should now not share a thread group and any UIDs.
+ // Child 2 should have no privileges. That means any signal other than
+ // SIGCONT should fail.
+ TEST_CHECK(kill(stopped_child, SIGKILL) < 0);
+ TEST_PCHECK_MSG(errno == EPERM, "kill failed with wrong errno");
+ MaybeSave();
+
+ TEST_PCHECK(kill(stopped_child, SIGCONT) == 0);
+ MaybeSave();
+
+ _exit(0);
+ }
+
+ ASSERT_THAT(stopped_child, SyscallSucceeds());
+
+ // Make sure child exited normally.
+ EXPECT_THAT(RetryEINTR(waitpid)(stopped_child, &status, 0),
+ SyscallSucceedsWithValue(stopped_child));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status " << status;
+
+ // Make sure other_child exited normally.
+ EXPECT_THAT(RetryEINTR(waitpid)(other_child, &status, 0),
+ SyscallSucceedsWithValue(other_child));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status " << status;
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/link.cc b/test/syscalls/linux/link.cc
new file mode 100644
index 000000000..544681168
--- /dev/null
+++ b/test/syscalls/linux/link.cc
@@ -0,0 +1,305 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "absl/flags/flag.h"
+#include "absl/strings/str_cat.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+ABSL_FLAG(int32_t, scratch_uid, 65534, "scratch UID");
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// IsSameFile returns true if both filenames have the same device and inode.
+bool IsSameFile(const std::string& f1, const std::string& f2) {
+ // Use lstat rather than stat, so that symlinks are not followed.
+ struct stat stat1 = {};
+ EXPECT_THAT(lstat(f1.c_str(), &stat1), SyscallSucceeds());
+ struct stat stat2 = {};
+ EXPECT_THAT(lstat(f2.c_str(), &stat2), SyscallSucceeds());
+
+ return stat1.st_dev == stat2.st_dev && stat1.st_ino == stat2.st_ino;
+}
+
+TEST(LinkTest, CanCreateLinkFile) {
+ auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const std::string newname = NewTempAbsPath();
+
+ // Get the initial link count.
+ uint64_t initial_link_count =
+ ASSERT_NO_ERRNO_AND_VALUE(Links(oldfile.path()));
+
+ EXPECT_THAT(link(oldfile.path().c_str(), newname.c_str()), SyscallSucceeds());
+
+ EXPECT_TRUE(IsSameFile(oldfile.path(), newname));
+
+ // Link count should be incremented.
+ EXPECT_THAT(Links(oldfile.path()),
+ IsPosixErrorOkAndHolds(initial_link_count + 1));
+
+ // Delete the link.
+ EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds());
+
+ // Link count should be back to initial.
+ EXPECT_THAT(Links(oldfile.path()),
+ IsPosixErrorOkAndHolds(initial_link_count));
+}
+
+TEST(LinkTest, PermissionDenied) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_FOWNER)));
+
+ // Make the file "unsafe" to link by making it only readable, but not
+ // writable.
+ const auto unwriteable_file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0400));
+ const std::string special_path = NewTempAbsPath();
+ ASSERT_THAT(mkfifo(special_path.c_str(), 0666), SyscallSucceeds());
+ const auto setuid_file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0666 | S_ISUID));
+
+ const std::string newname = NewTempAbsPath();
+
+ // Do setuid in a separate thread so that after finishing this test, the
+ // process can still open files the test harness created before starting this
+ // test. Otherwise, the files are created by root (UID before the test), but
+ // cannot be opened by the `uid` set below after the test. After calling
+ // setuid(non-zero-UID), there is no way to get root privileges back.
+ ScopedThread([&] {
+ // Use syscall instead of glibc setuid wrapper because we want this setuid
+ // call to only apply to this task. POSIX threads, however, require that all
+ // threads have the same UIDs, so using the setuid wrapper sets all threads'
+ // real UID.
+ // Also drops capabilities.
+ EXPECT_THAT(syscall(SYS_setuid, absl::GetFlag(FLAGS_scratch_uid)),
+ SyscallSucceeds());
+
+ EXPECT_THAT(link(unwriteable_file.path().c_str(), newname.c_str()),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(link(special_path.c_str(), newname.c_str()),
+ SyscallFailsWithErrno(EPERM));
+ if (!IsRunningWithVFS1()) {
+ EXPECT_THAT(link(setuid_file.path().c_str(), newname.c_str()),
+ SyscallFailsWithErrno(EPERM));
+ }
+ });
+}
+
+TEST(LinkTest, CannotLinkDirectory) {
+ auto olddir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const std::string newdir = NewTempAbsPath();
+
+ EXPECT_THAT(link(olddir.path().c_str(), newdir.c_str()),
+ SyscallFailsWithErrno(EPERM));
+
+ EXPECT_THAT(rmdir(olddir.path().c_str()), SyscallSucceeds());
+}
+
+TEST(LinkTest, CannotLinkWithSlash) {
+ auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ // Put a final "/" on newname.
+ const std::string newname = absl::StrCat(NewTempAbsPath(), "/");
+
+ EXPECT_THAT(link(oldfile.path().c_str(), newname.c_str()),
+ SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(LinkTest, OldnameIsEmpty) {
+ const std::string newname = NewTempAbsPath();
+ EXPECT_THAT(link("", newname.c_str()), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(LinkTest, OldnameDoesNotExist) {
+ const std::string oldname = NewTempAbsPath();
+ const std::string newname = NewTempAbsPath();
+ EXPECT_THAT(link("", newname.c_str()), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(LinkTest, NewnameCannotExist) {
+ const std::string newname =
+ JoinPath(GetAbsoluteTestTmpdir(), "thisdoesnotexist", "foo");
+ EXPECT_THAT(link("/thisdoesnotmatter", newname.c_str()),
+ SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(LinkTest, WithOldDirFD) {
+ const std::string oldname_parent = NewTempAbsPath();
+ const std::string oldname_base = "child";
+ const std::string oldname = JoinPath(oldname_parent, oldname_base);
+ const std::string newname = NewTempAbsPath();
+
+ // Create oldname_parent directory, and get an FD.
+ ASSERT_THAT(mkdir(oldname_parent.c_str(), 0777), SyscallSucceeds());
+ const FileDescriptor oldname_parent_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(oldname_parent, O_DIRECTORY | O_RDONLY));
+
+ // Create oldname file.
+ const FileDescriptor oldname_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(oldname, O_CREAT | O_RDWR, 0666));
+
+ // Link oldname to newname, using oldname_parent_fd.
+ EXPECT_THAT(linkat(oldname_parent_fd.get(), oldname_base.c_str(), AT_FDCWD,
+ newname.c_str(), 0),
+ SyscallSucceeds());
+
+ EXPECT_TRUE(IsSameFile(oldname, newname));
+
+ EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds());
+ EXPECT_THAT(unlink(oldname.c_str()), SyscallSucceeds());
+ EXPECT_THAT(rmdir(oldname_parent.c_str()), SyscallSucceeds());
+}
+
+TEST(LinkTest, BogusFlags) {
+ ASSERT_THAT(linkat(1, "foo", 2, "bar", 3), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(LinkTest, WithNewDirFD) {
+ auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const std::string newname_parent = NewTempAbsPath();
+ const std::string newname_base = "child";
+ const std::string newname = JoinPath(newname_parent, newname_base);
+
+ // Create newname_parent directory, and get an FD.
+ EXPECT_THAT(mkdir(newname_parent.c_str(), 0777), SyscallSucceeds());
+ const FileDescriptor newname_parent_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(newname_parent, O_DIRECTORY | O_RDONLY));
+
+ // Link newname to oldfile, using newname_parent_fd.
+ EXPECT_THAT(linkat(AT_FDCWD, oldfile.path().c_str(), newname_parent_fd.get(),
+ newname.c_str(), 0),
+ SyscallSucceeds());
+
+ EXPECT_TRUE(IsSameFile(oldfile.path(), newname));
+
+ EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds());
+ EXPECT_THAT(rmdir(newname_parent.c_str()), SyscallSucceeds());
+}
+
+TEST(LinkTest, RelPathsWithNonDirFDs) {
+ auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Create a file that will be passed as the directory fd for old/new names.
+ const std::string filename = NewTempAbsPath();
+ const FileDescriptor file_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_CREAT | O_RDWR, 0666));
+
+ // Using file_fd as olddirfd will fail.
+ EXPECT_THAT(linkat(file_fd.get(), "foo", AT_FDCWD, "bar", 0),
+ SyscallFailsWithErrno(ENOTDIR));
+
+ // Using file_fd as newdirfd will fail.
+ EXPECT_THAT(linkat(AT_FDCWD, oldfile.path().c_str(), file_fd.get(), "bar", 0),
+ SyscallFailsWithErrno(ENOTDIR));
+}
+
+TEST(LinkTest, AbsPathsWithNonDirFDs) {
+ auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const std::string newname = NewTempAbsPath();
+
+ // Create a file that will be passed as the directory fd for old/new names.
+ const std::string filename = NewTempAbsPath();
+ const FileDescriptor file_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_CREAT | O_RDWR, 0666));
+
+ // Using file_fd as the dirfds is OK as long as paths are absolute.
+ EXPECT_THAT(linkat(file_fd.get(), oldfile.path().c_str(), file_fd.get(),
+ newname.c_str(), 0),
+ SyscallSucceeds());
+}
+
+TEST(LinkTest, LinkDoesNotFollowSymlinks) {
+ // Create oldfile, and oldsymlink which points to it.
+ auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const std::string oldsymlink = NewTempAbsPath();
+ EXPECT_THAT(symlink(oldfile.path().c_str(), oldsymlink.c_str()),
+ SyscallSucceeds());
+
+ // Now hard link newname to oldsymlink.
+ const std::string newname = NewTempAbsPath();
+ EXPECT_THAT(link(oldsymlink.c_str(), newname.c_str()), SyscallSucceeds());
+
+ // The link should not have resolved the symlink, so newname and oldsymlink
+ // are the same.
+ EXPECT_TRUE(IsSameFile(oldsymlink, newname));
+ EXPECT_FALSE(IsSameFile(oldfile.path(), newname));
+
+ EXPECT_THAT(unlink(oldsymlink.c_str()), SyscallSucceeds());
+ EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds());
+}
+
+TEST(LinkTest, LinkatDoesNotFollowSymlinkByDefault) {
+ // Create oldfile, and oldsymlink which points to it.
+ auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const std::string oldsymlink = NewTempAbsPath();
+ EXPECT_THAT(symlink(oldfile.path().c_str(), oldsymlink.c_str()),
+ SyscallSucceeds());
+
+ // Now hard link newname to oldsymlink.
+ const std::string newname = NewTempAbsPath();
+ EXPECT_THAT(
+ linkat(AT_FDCWD, oldsymlink.c_str(), AT_FDCWD, newname.c_str(), 0),
+ SyscallSucceeds());
+
+ // The link should not have resolved the symlink, so newname and oldsymlink
+ // are the same.
+ EXPECT_TRUE(IsSameFile(oldsymlink, newname));
+ EXPECT_FALSE(IsSameFile(oldfile.path(), newname));
+
+ EXPECT_THAT(unlink(oldsymlink.c_str()), SyscallSucceeds());
+ EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds());
+}
+
+TEST(LinkTest, LinkatWithSymlinkFollow) {
+ // Create oldfile, and oldsymlink which points to it.
+ auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const std::string oldsymlink = NewTempAbsPath();
+ ASSERT_THAT(symlink(oldfile.path().c_str(), oldsymlink.c_str()),
+ SyscallSucceeds());
+
+ // Now hard link newname to oldsymlink, and pass AT_SYMLINK_FOLLOW flag.
+ const std::string newname = NewTempAbsPath();
+ ASSERT_THAT(linkat(AT_FDCWD, oldsymlink.c_str(), AT_FDCWD, newname.c_str(),
+ AT_SYMLINK_FOLLOW),
+ SyscallSucceeds());
+
+ // The link should have resolved the symlink, so oldfile and newname are the
+ // same.
+ EXPECT_TRUE(IsSameFile(oldfile.path(), newname));
+ EXPECT_FALSE(IsSameFile(oldsymlink, newname));
+
+ EXPECT_THAT(unlink(oldsymlink.c_str()), SyscallSucceeds());
+ EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/lseek.cc b/test/syscalls/linux/lseek.cc
new file mode 100644
index 000000000..6ce1e6cc3
--- /dev/null
+++ b/test/syscalls/linux/lseek.cc
@@ -0,0 +1,202 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(LseekTest, InvalidWhence) {
+ const std::string kFileData = "hello world\n";
+ const TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kFileData, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDWR, 0644));
+
+ ASSERT_THAT(lseek(fd.get(), 0, -1), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(LseekTest, NegativeOffset) {
+ const std::string kFileData = "hello world\n";
+ const TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kFileData, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDWR, 0644));
+
+ EXPECT_THAT(lseek(fd.get(), -(kFileData.length() + 1), SEEK_CUR),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// A 32-bit off_t is not large enough to represent an offset larger than
+// maximum file size on standard file systems, so it isn't possible to cause
+// overflow.
+#if defined(__x86_64__) || defined(__aarch64__)
+TEST(LseekTest, Overflow) {
+ // HA! Classic Linux. We really should have an EOVERFLOW
+ // here, since we're seeking to something that cannot be
+ // represented.. but instead we are given an EINVAL.
+ const std::string kFileData = "hello world\n";
+ const TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kFileData, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDWR, 0644));
+ EXPECT_THAT(lseek(fd.get(), 0x7fffffffffffffff, SEEK_END),
+ SyscallFailsWithErrno(EINVAL));
+}
+#endif
+
+TEST(LseekTest, Set) {
+ const std::string kFileData = "hello world\n";
+ const TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kFileData, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDWR, 0644));
+
+ char buf = '\0';
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+ ASSERT_THAT(read(fd.get(), &buf, 1), SyscallSucceedsWithValue(1));
+ EXPECT_EQ(buf, kFileData.c_str()[0]);
+ EXPECT_THAT(lseek(fd.get(), 6, SEEK_SET), SyscallSucceedsWithValue(6));
+ ASSERT_THAT(read(fd.get(), &buf, 1), SyscallSucceedsWithValue(1));
+ EXPECT_EQ(buf, kFileData.c_str()[6]);
+}
+
+TEST(LseekTest, Cur) {
+ const std::string kFileData = "hello world\n";
+ const TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kFileData, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDWR, 0644));
+
+ char buf = '\0';
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+ ASSERT_THAT(read(fd.get(), &buf, 1), SyscallSucceedsWithValue(1));
+ EXPECT_EQ(buf, kFileData.c_str()[0]);
+ EXPECT_THAT(lseek(fd.get(), 3, SEEK_CUR), SyscallSucceedsWithValue(4));
+ ASSERT_THAT(read(fd.get(), &buf, 1), SyscallSucceedsWithValue(1));
+ EXPECT_EQ(buf, kFileData.c_str()[4]);
+}
+
+TEST(LseekTest, End) {
+ const std::string kFileData = "hello world\n";
+ const TempPath path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kFileData, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(path.path(), O_RDWR, 0644));
+
+ char buf = '\0';
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+ ASSERT_THAT(read(fd.get(), &buf, 1), SyscallSucceedsWithValue(1));
+ EXPECT_EQ(buf, kFileData.c_str()[0]);
+ EXPECT_THAT(lseek(fd.get(), -2, SEEK_END), SyscallSucceedsWithValue(10));
+ ASSERT_THAT(read(fd.get(), &buf, 1), SyscallSucceedsWithValue(1));
+ EXPECT_EQ(buf, kFileData.c_str()[kFileData.length() - 2]);
+}
+
+TEST(LseekTest, InvalidFD) {
+ EXPECT_THAT(lseek(-1, 0, SEEK_SET), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(LseekTest, DirCurEnd) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open("/tmp", O_RDONLY));
+ ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+}
+
+TEST(LseekTest, ProcDir) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self", O_RDONLY));
+ ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds());
+ ASSERT_THAT(lseek(fd.get(), 0, SEEK_END), SyscallSucceeds());
+}
+
+TEST(LseekTest, ProcFile) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/meminfo", O_RDONLY));
+ ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds());
+ ASSERT_THAT(lseek(fd.get(), 0, SEEK_END), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(LseekTest, SysDir) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/sys/devices", O_RDONLY));
+ ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds());
+ ASSERT_THAT(lseek(fd.get(), 0, SEEK_END), SyscallSucceeds());
+}
+
+TEST(LseekTest, SeekCurrentDir) {
+ // From include/linux/fs.h.
+ constexpr loff_t MAX_LFS_FILESIZE = 0x7fffffffffffffff;
+
+ char* dir = get_current_dir_name();
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir, O_RDONLY));
+
+ ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds());
+ ASSERT_THAT(lseek(fd.get(), 0, SEEK_END),
+ // Some filesystems (like ext4) allow lseek(SEEK_END) on a
+ // directory and return MAX_LFS_FILESIZE, others return EINVAL.
+ AnyOf(SyscallSucceedsWithValue(MAX_LFS_FILESIZE),
+ SyscallFailsWithErrno(EINVAL)));
+ free(dir);
+}
+
+TEST(LseekTest, ProcStatTwice) {
+ const FileDescriptor fd1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/stat", O_RDONLY));
+ const FileDescriptor fd2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/stat", O_RDONLY));
+
+ ASSERT_THAT(lseek(fd1.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+ ASSERT_THAT(lseek(fd1.get(), 0, SEEK_END), SyscallFailsWithErrno(EINVAL));
+ ASSERT_THAT(lseek(fd1.get(), 1000, SEEK_CUR), SyscallSucceeds());
+ // Check that just because we moved fd1, fd2 doesn't move.
+ ASSERT_THAT(lseek(fd2.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+
+ const FileDescriptor fd3 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/stat", O_RDONLY));
+ ASSERT_THAT(lseek(fd3.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+}
+
+TEST(LseekTest, EtcPasswdDup) {
+ const FileDescriptor fd1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/etc/passwd", O_RDONLY));
+ const FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(fd1.Dup());
+
+ ASSERT_THAT(lseek(fd1.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+ ASSERT_THAT(lseek(fd2.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+ ASSERT_THAT(lseek(fd1.get(), 1000, SEEK_CUR), SyscallSucceeds());
+ // Check that just because we moved fd1, fd2 doesn't move.
+ ASSERT_THAT(lseek(fd2.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(1000));
+
+ const FileDescriptor fd3 = ASSERT_NO_ERRNO_AND_VALUE(fd1.Dup());
+ ASSERT_THAT(lseek(fd3.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(1000));
+}
+
+// TODO(magi): Add tests where we have donated in sockets.
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/madvise.cc b/test/syscalls/linux/madvise.cc
new file mode 100644
index 000000000..5a1973f60
--- /dev/null
+++ b/test/syscalls/linux/madvise.cc
@@ -0,0 +1,251 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/memory_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void ExpectAllMappingBytes(Mapping const& m, char c) {
+ auto const v = m.view();
+ for (size_t i = 0; i < kPageSize; i++) {
+ ASSERT_EQ(v[i], c) << "at offset " << i;
+ }
+}
+
+// Equivalent to ExpectAllMappingBytes but async-signal-safe and with less
+// helpful failure messages.
+void CheckAllMappingBytes(Mapping const& m, char c) {
+ auto const v = m.view();
+ for (size_t i = 0; i < kPageSize; i++) {
+ TEST_CHECK_MSG(v[i] == c, "mapping contains wrong value");
+ }
+}
+
+TEST(MadviseDontneedTest, ZerosPrivateAnonPage) {
+ auto m = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ ExpectAllMappingBytes(m, 0);
+ memset(m.ptr(), 1, m.len());
+ ExpectAllMappingBytes(m, 1);
+ ASSERT_THAT(madvise(m.ptr(), m.len(), MADV_DONTNEED), SyscallSucceeds());
+ ExpectAllMappingBytes(m, 0);
+}
+
+TEST(MadviseDontneedTest, ZerosCOWAnonPageInCallerOnly) {
+ auto m = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ ExpectAllMappingBytes(m, 0);
+ memset(m.ptr(), 2, m.len());
+ ExpectAllMappingBytes(m, 2);
+
+ // Do madvise in a child process.
+ pid_t pid = fork();
+ CheckAllMappingBytes(m, 2);
+ if (pid == 0) {
+ TEST_PCHECK(madvise(m.ptr(), m.len(), MADV_DONTNEED) == 0);
+ CheckAllMappingBytes(m, 0);
+ _exit(0);
+ }
+
+ ASSERT_THAT(pid, SyscallSucceeds());
+
+ int status = 0;
+ ASSERT_THAT(waitpid(-1, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status));
+ EXPECT_EQ(WEXITSTATUS(status), 0);
+ // The child's madvise should not have affected the parent.
+ ExpectAllMappingBytes(m, 2);
+}
+
+TEST(MadviseDontneedTest, DoesNotModifySharedAnonPage) {
+ auto m = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED));
+ ExpectAllMappingBytes(m, 0);
+ memset(m.ptr(), 3, m.len());
+ ExpectAllMappingBytes(m, 3);
+ ASSERT_THAT(madvise(m.ptr(), m.len(), MADV_DONTNEED), SyscallSucceeds());
+ ExpectAllMappingBytes(m, 3);
+}
+
+TEST(MadviseDontneedTest, CleansPrivateFilePage) {
+ TempPath f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ /* parent = */ GetAbsoluteTestTmpdir(),
+ /* content = */ std::string(kPageSize, 4), TempPath::kDefaultFileMode));
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR));
+
+ Mapping m = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd.get(), 0));
+ ExpectAllMappingBytes(m, 4);
+ memset(m.ptr(), 5, m.len());
+ ExpectAllMappingBytes(m, 5);
+ ASSERT_THAT(madvise(m.ptr(), m.len(), MADV_DONTNEED), SyscallSucceeds());
+ ExpectAllMappingBytes(m, 4);
+}
+
+TEST(MadviseDontneedTest, DoesNotModifySharedFilePage) {
+ TempPath f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ /* parent = */ GetAbsoluteTestTmpdir(),
+ /* content = */ std::string(kPageSize, 6), TempPath::kDefaultFileMode));
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR));
+
+ Mapping m = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd.get(), 0));
+ ExpectAllMappingBytes(m, 6);
+ memset(m.ptr(), 7, m.len());
+ ExpectAllMappingBytes(m, 7);
+ ASSERT_THAT(madvise(m.ptr(), m.len(), MADV_DONTNEED), SyscallSucceeds());
+ ExpectAllMappingBytes(m, 7);
+}
+
+TEST(MadviseDontneedTest, IgnoresPermissions) {
+ auto m =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE));
+ EXPECT_THAT(madvise(m.ptr(), m.len(), MADV_DONTNEED), SyscallSucceeds());
+}
+
+TEST(MadviseDontforkTest, AddressLength) {
+ auto m =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE));
+ char* addr = static_cast<char*>(m.ptr());
+
+ // Address must be page aligned.
+ EXPECT_THAT(madvise(addr + 1, kPageSize, MADV_DONTFORK),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Zero length madvise always succeeds.
+ EXPECT_THAT(madvise(addr, 0, MADV_DONTFORK), SyscallSucceeds());
+
+ // Length must not roll over after rounding up.
+ size_t badlen = std::numeric_limits<std::size_t>::max() - (kPageSize / 2);
+ EXPECT_THAT(madvise(0, badlen, MADV_DONTFORK), SyscallFailsWithErrno(EINVAL));
+
+ // Length need not be page aligned - it is implicitly rounded up.
+ EXPECT_THAT(madvise(addr, 1, MADV_DONTFORK), SyscallSucceeds());
+ EXPECT_THAT(madvise(addr, kPageSize, MADV_DONTFORK), SyscallSucceeds());
+}
+
+TEST(MadviseDontforkTest, DontforkShared) {
+ // Mmap two shared file-backed pages and MADV_DONTFORK the second page.
+ TempPath f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ /* parent = */ GetAbsoluteTestTmpdir(),
+ /* content = */ std::string(kPageSize * 2, 2),
+ TempPath::kDefaultFileMode));
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR));
+
+ Mapping m = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize * 2, PROT_READ | PROT_WRITE, MAP_SHARED, fd.get(), 0));
+
+ const Mapping ms1 = Mapping(reinterpret_cast<void*>(m.addr()), kPageSize);
+ const Mapping ms2 =
+ Mapping(reinterpret_cast<void*>(m.addr() + kPageSize), kPageSize);
+ m.release();
+
+ ASSERT_THAT(madvise(ms2.ptr(), kPageSize, MADV_DONTFORK), SyscallSucceeds());
+
+ const auto rest = [&] {
+ // First page is mapped in child and modifications are visible to parent
+ // via the shared mapping.
+ TEST_CHECK(IsMapped(ms1.addr()));
+ ExpectAllMappingBytes(ms1, 2);
+ memset(ms1.ptr(), 1, kPageSize);
+ ExpectAllMappingBytes(ms1, 1);
+
+ // Second page must not be mapped in child.
+ TEST_CHECK(!IsMapped(ms2.addr()));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+
+ ExpectAllMappingBytes(ms1, 1); // page contents modified by child.
+ ExpectAllMappingBytes(ms2, 2); // page contents unchanged.
+}
+
+TEST(MadviseDontforkTest, DontforkAnonPrivate) {
+ // Mmap three anonymous pages and MADV_DONTFORK the middle page.
+ Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize * 3, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ const Mapping mp1 = Mapping(reinterpret_cast<void*>(m.addr()), kPageSize);
+ const Mapping mp2 =
+ Mapping(reinterpret_cast<void*>(m.addr() + kPageSize), kPageSize);
+ const Mapping mp3 =
+ Mapping(reinterpret_cast<void*>(m.addr() + 2 * kPageSize), kPageSize);
+ m.release();
+
+ ASSERT_THAT(madvise(mp2.ptr(), kPageSize, MADV_DONTFORK), SyscallSucceeds());
+
+ // Verify that all pages are zeroed and memset the first, second and third
+ // pages to 1, 2, and 3 respectively.
+ ExpectAllMappingBytes(mp1, 0);
+ memset(mp1.ptr(), 1, kPageSize);
+
+ ExpectAllMappingBytes(mp2, 0);
+ memset(mp2.ptr(), 2, kPageSize);
+
+ ExpectAllMappingBytes(mp3, 0);
+ memset(mp3.ptr(), 3, kPageSize);
+
+ const auto rest = [&] {
+ // Verify first page is mapped, verify its contents and then modify the
+ // page. The mapping is private so the modifications are not visible to
+ // the parent.
+ TEST_CHECK(IsMapped(mp1.addr()));
+ ExpectAllMappingBytes(mp1, 1);
+ memset(mp1.ptr(), 11, kPageSize);
+ ExpectAllMappingBytes(mp1, 11);
+
+ // Verify second page is not mapped.
+ TEST_CHECK(!IsMapped(mp2.addr()));
+
+ // Verify third page is mapped, verify its contents and then modify the
+ // page. The mapping is private so the modifications are not visible to
+ // the parent.
+ TEST_CHECK(IsMapped(mp3.addr()));
+ ExpectAllMappingBytes(mp3, 3);
+ memset(mp3.ptr(), 13, kPageSize);
+ ExpectAllMappingBytes(mp3, 13);
+ };
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+
+ // The fork and COW by child should not affect the parent mappings.
+ ExpectAllMappingBytes(mp1, 1);
+ ExpectAllMappingBytes(mp2, 2);
+ ExpectAllMappingBytes(mp3, 3);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/memfd.cc b/test/syscalls/linux/memfd.cc
new file mode 100644
index 000000000..f8b7f7938
--- /dev/null
+++ b/test/syscalls/linux/memfd.cc
@@ -0,0 +1,557 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/magic.h>
+#include <linux/memfd.h>
+#include <linux/unistd.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/statfs.h>
+#include <sys/syscall.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/memory_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+// The header sys/memfd.h isn't available on all systems, so redefining some of
+// the constants here.
+#define F_LINUX_SPECIFIC_BASE 1024
+
+#ifndef F_ADD_SEALS
+#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#endif /* F_ADD_SEALS */
+
+#ifndef F_GET_SEALS
+#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+#endif /* F_GET_SEALS */
+
+#define F_SEAL_SEAL 0x0001
+#define F_SEAL_SHRINK 0x0002
+#define F_SEAL_GROW 0x0004
+#define F_SEAL_WRITE 0x0008
+
+using ::testing::StartsWith;
+
+const std::string kMemfdName = "some-memfd";
+
+int memfd_create(const std::string& name, unsigned int flags) {
+ return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+PosixErrorOr<FileDescriptor> MemfdCreate(const std::string& name,
+ uint32_t flags) {
+ int fd = memfd_create(name, flags);
+ if (fd < 0) {
+ return PosixError(
+ errno, absl::StrFormat("memfd_create(\"%s\", %#x)", name, flags));
+ }
+ MaybeSave();
+ return FileDescriptor(fd);
+}
+
+// Procfs entries for memfds display the appropriate name.
+TEST(MemfdTest, Name) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0));
+ const std::string proc_name = ASSERT_NO_ERRNO_AND_VALUE(
+ ReadLink(absl::StrFormat("/proc/self/fd/%d", memfd.get())));
+ EXPECT_THAT(proc_name, StartsWith("/memfd:" + kMemfdName));
+}
+
+// Memfds support read/write syscalls.
+TEST(MemfdTest, WriteRead) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0));
+
+ // Write a random page of data to the memfd via write(2).
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Read back the same data and verify.
+ std::vector<char> buf2(kPageSize);
+ ASSERT_THAT(lseek(memfd.get(), 0, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(read(memfd.get(), buf2.data(), buf2.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(buf, buf2);
+}
+
+// Memfds can be mapped and used as usual.
+TEST(MemfdTest, Mmap) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0));
+ const Mapping m1 = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0));
+
+ // Write a random page of data to the memfd via mmap m1.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ memcpy(m1.ptr(), buf.data(), buf.size());
+
+ // Read the data back via a read syscall on the memfd.
+ std::vector<char> buf2(kPageSize);
+ EXPECT_THAT(read(memfd.get(), buf2.data(), buf2.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(buf, buf2);
+
+ // The same data should be accessible via a new mapping m2.
+ const Mapping m2 = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0));
+ EXPECT_EQ(0, memcmp(m1.ptr(), m2.ptr(), kPageSize));
+}
+
+TEST(MemfdTest, DuplicateFDsShareContent) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0));
+ const Mapping m1 = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0));
+ const FileDescriptor memfd2 = ASSERT_NO_ERRNO_AND_VALUE(memfd.Dup());
+
+ // Write a random page of data to the memfd via mmap m1.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ memcpy(m1.ptr(), buf.data(), buf.size());
+
+ // Read the data back via a read syscall on a duplicate fd.
+ std::vector<char> buf2(kPageSize);
+ EXPECT_THAT(read(memfd2.get(), buf2.data(), buf2.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(buf, buf2);
+}
+
+// File seals are disabled by default on memfds.
+TEST(MemfdTest, SealingDisabledByDefault) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0));
+ EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_SEAL));
+ // Attempting to set any seal should fail.
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE),
+ SyscallFailsWithErrno(EPERM));
+}
+
+// Seals can be retrieved and updated for memfds.
+TEST(MemfdTest, SealsGetSet) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ int seals;
+ ASSERT_THAT(seals = fcntl(memfd.get(), F_GET_SEALS), SyscallSucceeds());
+ // No seals are set yet.
+ EXPECT_EQ(0, seals);
+
+ // Set a seal and check that we can get it back.
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds());
+ EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_WRITE));
+
+ // Set some more seals and verify.
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK),
+ SyscallSucceeds());
+ EXPECT_THAT(
+ fcntl(memfd.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_WRITE | F_SEAL_GROW | F_SEAL_SHRINK));
+
+ // Attempting to set a seal that is already set is a no-op.
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds());
+ EXPECT_THAT(
+ fcntl(memfd.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_WRITE | F_SEAL_GROW | F_SEAL_SHRINK));
+
+ // Add remaining seals and verify.
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_SEAL), SyscallSucceeds());
+ EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_WRITE | F_SEAL_GROW |
+ F_SEAL_SHRINK | F_SEAL_SEAL));
+}
+
+// F_SEAL_GROW prevents a memfd from being grown using ftruncate.
+TEST(MemfdTest, SealGrowWithTruncate) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds());
+
+ // Try grow the memfd by 1 page.
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize * 2),
+ SyscallFailsWithErrno(EPERM));
+
+ // Ftruncate calls that don't actually grow the memfd are allowed.
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize / 2), SyscallSucceeds());
+
+ // After shrinking, growing back is not allowed.
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallFailsWithErrno(EPERM));
+}
+
+// F_SEAL_GROW prevents a memfd from being grown using the write syscall.
+TEST(MemfdTest, SealGrowWithWrite) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+
+ // Initially, writing to the memfd succeeds.
+ const std::vector<char> buf(kPageSize);
+ EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Apply F_SEAL_GROW, subsequent writes which extend the memfd should fail.
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds());
+ EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallFailsWithErrno(EPERM));
+
+ // However, zero-length writes are ok since they don't grow the memfd.
+ EXPECT_THAT(write(memfd.get(), buf.data(), 0), SyscallSucceeds());
+
+ // Writing to existing parts of the memfd is also ok.
+ ASSERT_THAT(lseek(memfd.get(), 0, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Returning the end of the file and writing still not allowed.
+ EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallFailsWithErrno(EPERM));
+}
+
+// F_SEAL_GROW causes writes which partially extend off the current EOF to
+// partially succeed, up to the page containing the EOF.
+TEST(MemfdTest, SealGrowPartialWriteTruncated) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds());
+
+ // FD offset: 1 page, EOF: 1 page.
+
+ ASSERT_THAT(lseek(memfd.get(), kPageSize * 3 / 4, SEEK_SET),
+ SyscallSucceeds());
+
+ // FD offset: 3/4 page. Writing a full page now should only write 1/4 page
+ // worth of data. This partially succeeds because the first page is entirely
+ // within the file and requires no growth, but attempting to write the final
+ // 3/4 page would require growing the file.
+ const std::vector<char> buf(kPageSize);
+ EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize / 4));
+}
+
+// F_SEAL_GROW causes writes which partially extend off the current EOF to fail
+// in its entirety if the only data written would be to the page containing the
+// EOF.
+TEST(MemfdTest, SealGrowPartialWriteTruncatedSamePage) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize * 3 / 4), SyscallSucceeds());
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds());
+
+ // EOF: 3/4 page, writing 1/2 page starting at 1/2 page would cause the file
+ // to grow. Since this would require only the page containing the EOF to be
+ // modified, the write is rejected entirely.
+ const std::vector<char> buf(kPageSize / 2);
+ EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size(), kPageSize / 2),
+ SyscallFailsWithErrno(EPERM));
+
+ // However, writing up to EOF is fine.
+ EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size() / 2, kPageSize / 2),
+ SyscallSucceedsWithValue(kPageSize / 4));
+}
+
+// F_SEAL_SHRINK prevents a memfd from being shrunk using ftruncate.
+TEST(MemfdTest, SealShrink) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_SHRINK),
+ SyscallSucceeds());
+
+ // Shrink by half a page.
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize / 2),
+ SyscallFailsWithErrno(EPERM));
+
+ // Ftruncate calls that don't actually shrink the file are allowed.
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallSucceeds());
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize * 2), SyscallSucceeds());
+
+ // After growing, shrinking is still not allowed.
+ ASSERT_THAT(ftruncate(memfd.get(), kPageSize), SyscallFailsWithErrno(EPERM));
+}
+
+// F_SEAL_WRITE prevents a memfd from being written to through a write
+// syscall.
+TEST(MemfdTest, SealWriteWithWrite) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ const std::vector<char> buf(kPageSize);
+ ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds());
+
+ // Attemping to write at the end of the file fails.
+ EXPECT_THAT(write(memfd.get(), buf.data(), 1), SyscallFailsWithErrno(EPERM));
+
+ // Attemping to overwrite an existing part of the memfd fails.
+ EXPECT_THAT(pwrite(memfd.get(), buf.data(), 1, 0),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size() / 2, kPageSize / 2),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(pwrite(memfd.get(), buf.data(), buf.size(), kPageSize / 2),
+ SyscallFailsWithErrno(EPERM));
+
+ // Zero-length writes however do not fail.
+ EXPECT_THAT(write(memfd.get(), buf.data(), 0), SyscallSucceeds());
+}
+
+// F_SEAL_WRITE prevents a memfd from being written to through an mmap.
+TEST(MemfdTest, SealWriteWithMmap) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ const std::vector<char> buf(kPageSize);
+ ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds());
+
+ // Can't create a shared mapping with writes sealed.
+ void* ret = mmap(nullptr, kPageSize, PROT_WRITE, MAP_SHARED, memfd.get(), 0);
+ EXPECT_EQ(ret, MAP_FAILED);
+ EXPECT_EQ(errno, EPERM);
+ ret = mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, memfd.get(), 0);
+ EXPECT_EQ(ret, MAP_FAILED);
+ EXPECT_EQ(errno, EPERM);
+
+ // However, private mappings are ok.
+ EXPECT_NO_ERRNO(Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
+ memfd.get(), 0));
+}
+
+// Adding F_SEAL_WRITE fails when there are outstanding writable mappings to a
+// memfd.
+TEST(MemfdTest, SealWriteWithOutstandingWritbleMapping) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ const std::vector<char> buf(kPageSize);
+ ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Attempting to add F_SEAL_WRITE with active shared mapping with any set of
+ // permissions fails.
+
+ // Read-only shared mapping.
+ {
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, memfd.get(), 0));
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE),
+ SyscallFailsWithErrno(EBUSY));
+ }
+
+ // Write-only shared mapping.
+ {
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_WRITE, MAP_SHARED, memfd.get(), 0));
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE),
+ SyscallFailsWithErrno(EBUSY));
+ }
+
+ // Read-write shared mapping.
+ {
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ memfd.get(), 0));
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE),
+ SyscallFailsWithErrno(EBUSY));
+ }
+
+ // F_SEAL_WRITE can be set with private mappings with any permissions.
+ {
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
+ memfd.get(), 0));
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE),
+ SyscallSucceeds());
+ }
+}
+
+// When applying F_SEAL_WRITE fails due to outstanding writable mappings, any
+// additional seals passed to the same add seal call are also rejected.
+TEST(MemfdTest, NoPartialSealApplicationWhenWriteSealRejected) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0));
+
+ // Try add some seals along with F_SEAL_WRITE. The seal application should
+ // fail since there exists an active shared mapping.
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE | F_SEAL_GROW),
+ SyscallFailsWithErrno(EBUSY));
+
+ // None of the seals should be applied.
+ EXPECT_THAT(fcntl(memfd.get(), F_GET_SEALS), SyscallSucceedsWithValue(0));
+}
+
+// Seals are inode level properties, and apply to all file descriptors referring
+// to a memfd.
+TEST(MemfdTest, SealsAreInodeLevelProperties) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ const FileDescriptor memfd2 = ASSERT_NO_ERRNO_AND_VALUE(memfd.Dup());
+
+ // Add seal through the original memfd, and verify that it appears on the
+ // dupped fd.
+ ASSERT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds());
+ EXPECT_THAT(fcntl(memfd2.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_WRITE));
+
+ // Verify the seal actually applies to both fds.
+ std::vector<char> buf(kPageSize);
+ EXPECT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(write(memfd2.get(), buf.data(), buf.size()),
+ SyscallFailsWithErrno(EPERM));
+
+ // Seals are enforced on new FDs that are dupped after the seal is already
+ // applied.
+ const FileDescriptor memfd3 = ASSERT_NO_ERRNO_AND_VALUE(memfd2.Dup());
+ EXPECT_THAT(write(memfd3.get(), buf.data(), buf.size()),
+ SyscallFailsWithErrno(EPERM));
+
+ // Try a new seal applied to one of the dupped fds.
+ ASSERT_THAT(fcntl(memfd3.get(), F_ADD_SEALS, F_SEAL_GROW), SyscallSucceeds());
+ EXPECT_THAT(ftruncate(memfd.get(), kPageSize), SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(ftruncate(memfd2.get(), kPageSize), SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(ftruncate(memfd3.get(), kPageSize), SyscallFailsWithErrno(EPERM));
+}
+
+PosixErrorOr<bool> IsTmpfs(const std::string& path) {
+ struct statfs stat;
+ if (statfs(path.c_str(), &stat)) {
+ if (errno == ENOENT) {
+ // Nothing at path, don't raise this as an error. Instead, just report no
+ // tmpfs at path.
+ return false;
+ }
+ return PosixError(errno,
+ absl::StrFormat("statfs(\"%s\", %#p)", path, &stat));
+ }
+ return stat.f_type == TMPFS_MAGIC;
+}
+
+// Tmpfs files also support seals, but are created with F_SEAL_SEAL.
+TEST(MemfdTest, TmpfsFilesHaveSealSeal) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsTmpfs("/tmp")));
+ const TempPath tmpfs_file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn("/tmp"));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfs_file.path(), O_RDWR, 0644));
+ EXPECT_THAT(fcntl(fd.get(), F_GET_SEALS),
+ SyscallSucceedsWithValue(F_SEAL_SEAL));
+}
+
+// Can open a memfd from procfs and use as normal.
+TEST(MemfdTest, CanOpenFromProcfs) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+
+ // Write a random page of data to the memfd via write(2).
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Read back the same data from the fd obtained from procfs and verify.
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(absl::StrFormat("/proc/self/fd/%d", memfd.get()), O_RDWR));
+ std::vector<char> buf2(kPageSize);
+ EXPECT_THAT(pread(fd.get(), buf2.data(), buf2.size(), 0),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(buf, buf2);
+}
+
+// Test that memfd permissions are set up correctly to allow another process to
+// open it from procfs.
+TEST(MemfdTest, OtherProcessCanOpenFromProcfs) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+ const auto memfd_path =
+ absl::StrFormat("/proc/%d/fd/%d", getpid(), memfd.get());
+ const auto rest = [&] {
+ int fd = open(memfd_path.c_str(), O_RDWR);
+ TEST_PCHECK(fd >= 0);
+ TEST_PCHECK(close(fd) >= 0);
+ };
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+// Test that only files opened as writable can have seals applied to them.
+// Normally there's no way to specify file permissions on memfds, but we can
+// obtain a read-only memfd by opening the corresponding procfs fd entry as
+// read-only.
+TEST(MemfdTest, MemfdMustBeWritableToModifySeals) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, MFD_ALLOW_SEALING));
+
+ // Initially adding a seal works.
+ EXPECT_THAT(fcntl(memfd.get(), F_ADD_SEALS, F_SEAL_WRITE), SyscallSucceeds());
+
+ // Re-open the memfd as read-only from procfs.
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(absl::StrFormat("/proc/self/fd/%d", memfd.get()), O_RDONLY));
+
+ // Can't add seals through an unwritable fd.
+ EXPECT_THAT(fcntl(fd.get(), F_ADD_SEALS, F_SEAL_GROW),
+ SyscallFailsWithErrno(EPERM));
+}
+
+// Test that the memfd implementation internally tracks potentially writable
+// maps correctly.
+TEST(MemfdTest, MultipleWritableAndNonWritableRefsToSameFileRegion) {
+ const FileDescriptor memfd =
+ ASSERT_NO_ERRNO_AND_VALUE(MemfdCreate(kMemfdName, 0));
+
+ // Populate with a random page of data.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(memfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Read-only map to the page. This should cause an initial mapping to be
+ // created.
+ Mapping m1 = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_READ, MAP_PRIVATE, memfd.get(), 0));
+
+ // Create a shared writable map to the page. This should cause the internal
+ // mapping to become potentially writable.
+ Mapping m2 = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, memfd.get(), 0));
+
+ // Drop the read-only mapping first. If writable-ness isn't tracked correctly,
+ // this can cause some misaccounting, which can trigger asserts internally.
+ m1.reset();
+ m2.reset();
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/memory_accounting.cc b/test/syscalls/linux/memory_accounting.cc
new file mode 100644
index 000000000..94aea4077
--- /dev/null
+++ b/test/syscalls/linux/memory_accounting.cc
@@ -0,0 +1,99 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/mman.h>
+
+#include <map>
+
+#include "gtest/gtest.h"
+#include "absl/strings/match.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_split.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+using ::absl::StrFormat;
+
+// AnonUsageFromMeminfo scrapes the current anonymous memory usage from
+// /proc/meminfo and returns it in bytes.
+PosixErrorOr<uint64_t> AnonUsageFromMeminfo() {
+ ASSIGN_OR_RETURN_ERRNO(auto meminfo, GetContents("/proc/meminfo"));
+ std::vector<std::string> lines(absl::StrSplit(meminfo, '\n'));
+
+ // Try to find AnonPages line, the format is AnonPages:\\s+(\\d+) kB\n.
+ for (const auto& line : lines) {
+ if (!absl::StartsWith(line, "AnonPages:")) {
+ continue;
+ }
+
+ std::vector<std::string> parts(
+ absl::StrSplit(line, ' ', absl::SkipEmpty()));
+ if (parts.size() == 3) {
+ // The size is the second field, let's try to parse it as a number.
+ ASSIGN_OR_RETURN_ERRNO(auto anon_kb, Atoi<uint64_t>(parts[1]));
+ return anon_kb * 1024;
+ }
+
+ return PosixError(EINVAL, "AnonPages field in /proc/meminfo was malformed");
+ }
+
+ return PosixError(EINVAL, "AnonPages field not found in /proc/meminfo");
+}
+
+TEST(MemoryAccounting, AnonAccountingPreservedOnSaveRestore) {
+ // This test isn't meaningful on Linux. /proc/meminfo reports system-wide
+ // memory usage, which can change arbitrarily in Linux from other activity on
+ // the machine. In gvisor, this test is the only thing running on the
+ // "machine", so values in /proc/meminfo accurately reflect the memory used by
+ // the test.
+ SKIP_IF(!IsRunningOnGvisor());
+
+ uint64_t anon_initial = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
+
+ // Cause some anonymous memory usage.
+ uint64_t map_bytes = Megabytes(512);
+ char* mem =
+ static_cast<char*>(mmap(nullptr, map_bytes, PROT_READ | PROT_WRITE,
+ MAP_POPULATE | MAP_ANON | MAP_PRIVATE, -1, 0));
+ ASSERT_NE(mem, MAP_FAILED)
+ << "Map failed, errno: " << errno << " (" << strerror(errno) << ").";
+
+ // Write something to each page to prevent them from being decommited on
+ // S/R. Zero pages are dropped on save.
+ for (uint64_t i = 0; i < map_bytes; i += kPageSize) {
+ mem[i] = 'a';
+ }
+
+ uint64_t anon_after_alloc = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
+ EXPECT_THAT(anon_after_alloc,
+ EquivalentWithin(anon_initial + map_bytes, 0.03));
+
+ // We have many implicit S/R cycles from scraping /proc/meminfo throughout the
+ // test, but throw an explicit S/R in here as well.
+ MaybeSave();
+
+ // Usage should remain the same across S/R.
+ uint64_t anon_after_sr = ASSERT_NO_ERRNO_AND_VALUE(AnonUsageFromMeminfo());
+ EXPECT_THAT(anon_after_sr, EquivalentWithin(anon_after_alloc, 0.03));
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc
new file mode 100644
index 000000000..059fad598
--- /dev/null
+++ b/test/syscalls/linux/mempolicy.cc
@@ -0,0 +1,289 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <sys/syscall.h>
+
+#include "gtest/gtest.h"
+#include "absl/memory/memory.h"
+#include "test/util/cleanup.h"
+#include "test/util/memory_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+#define BITS_PER_BYTE 8
+
+#define MPOL_F_STATIC_NODES (1 << 15)
+#define MPOL_F_RELATIVE_NODES (1 << 14)
+#define MPOL_DEFAULT 0
+#define MPOL_PREFERRED 1
+#define MPOL_BIND 2
+#define MPOL_INTERLEAVE 3
+#define MPOL_LOCAL 4
+#define MPOL_F_NODE (1 << 0)
+#define MPOL_F_ADDR (1 << 1)
+#define MPOL_F_MEMS_ALLOWED (1 << 2)
+#define MPOL_MF_STRICT (1 << 0)
+#define MPOL_MF_MOVE (1 << 1)
+#define MPOL_MF_MOVE_ALL (1 << 2)
+
+int get_mempolicy(int* policy, uint64_t* nmask, uint64_t maxnode, void* addr,
+ int flags) {
+ return syscall(SYS_get_mempolicy, policy, nmask, maxnode, addr, flags);
+}
+
+int set_mempolicy(int mode, uint64_t* nmask, uint64_t maxnode) {
+ return syscall(SYS_set_mempolicy, mode, nmask, maxnode);
+}
+
+int mbind(void* addr, unsigned long len, int mode,
+ const unsigned long* nodemask, unsigned long maxnode,
+ unsigned flags) {
+ return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
+}
+
+// Creates a cleanup object that resets the calling thread's mempolicy to the
+// system default when the calling scope ends.
+Cleanup ScopedMempolicy() {
+ return Cleanup([] {
+ EXPECT_THAT(set_mempolicy(MPOL_DEFAULT, nullptr, 0), SyscallSucceeds());
+ });
+}
+
+// Temporarily change the memory policy for the calling thread within the
+// caller's scope.
+PosixErrorOr<Cleanup> ScopedSetMempolicy(int mode, uint64_t* nmask,
+ uint64_t maxnode) {
+ if (set_mempolicy(mode, nmask, maxnode)) {
+ return PosixError(errno, "set_mempolicy");
+ }
+ return ScopedMempolicy();
+}
+
+TEST(MempolicyTest, CheckDefaultPolicy) {
+ int mode = 0;
+ uint64_t nodemask = 0;
+ ASSERT_THAT(get_mempolicy(&mode, &nodemask, sizeof(nodemask) * BITS_PER_BYTE,
+ nullptr, 0),
+ SyscallSucceeds());
+
+ EXPECT_EQ(MPOL_DEFAULT, mode);
+ EXPECT_EQ(0x0, nodemask);
+}
+
+TEST(MempolicyTest, PolicyPreservedAfterSetMempolicy) {
+ uint64_t nodemask = 0x1;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSetMempolicy(
+ MPOL_BIND, &nodemask, sizeof(nodemask) * BITS_PER_BYTE));
+
+ int mode = 0;
+ uint64_t nodemask_after = 0x0;
+ ASSERT_THAT(get_mempolicy(&mode, &nodemask_after,
+ sizeof(nodemask_after) * BITS_PER_BYTE, nullptr, 0),
+ SyscallSucceeds());
+ EXPECT_EQ(MPOL_BIND, mode);
+ EXPECT_EQ(0x1, nodemask_after);
+
+ // Try throw in some mode flags.
+ for (auto mode_flag : {MPOL_F_STATIC_NODES, MPOL_F_RELATIVE_NODES}) {
+ auto cleanup2 = ASSERT_NO_ERRNO_AND_VALUE(
+ ScopedSetMempolicy(MPOL_INTERLEAVE | mode_flag, &nodemask,
+ sizeof(nodemask) * BITS_PER_BYTE));
+ mode = 0;
+ nodemask_after = 0x0;
+ ASSERT_THAT(
+ get_mempolicy(&mode, &nodemask_after,
+ sizeof(nodemask_after) * BITS_PER_BYTE, nullptr, 0),
+ SyscallSucceeds());
+ EXPECT_EQ(MPOL_INTERLEAVE | mode_flag, mode);
+ EXPECT_EQ(0x1, nodemask_after);
+ }
+}
+
+TEST(MempolicyTest, SetMempolicyRejectsInvalidInputs) {
+ auto cleanup = ScopedMempolicy();
+ uint64_t nodemask;
+
+ if (IsRunningOnGvisor()) {
+ // Invalid nodemask, we only support a single node on gvisor.
+ nodemask = 0x4;
+ ASSERT_THAT(set_mempolicy(MPOL_DEFAULT, &nodemask,
+ sizeof(nodemask) * BITS_PER_BYTE),
+ SyscallFailsWithErrno(EINVAL));
+ }
+
+ nodemask = 0x1;
+
+ // Invalid mode.
+ ASSERT_THAT(set_mempolicy(7439, &nodemask, sizeof(nodemask) * BITS_PER_BYTE),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Invalid nodemask size.
+ ASSERT_THAT(set_mempolicy(MPOL_DEFAULT, &nodemask, 0),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Invalid mode flag.
+ ASSERT_THAT(
+ set_mempolicy(MPOL_DEFAULT | MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES,
+ &nodemask, sizeof(nodemask) * BITS_PER_BYTE),
+ SyscallFailsWithErrno(EINVAL));
+
+ // MPOL_INTERLEAVE with empty nodemask.
+ nodemask = 0x0;
+ ASSERT_THAT(set_mempolicy(MPOL_INTERLEAVE, &nodemask,
+ sizeof(nodemask) * BITS_PER_BYTE),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// The manpages specify that the nodemask provided to set_mempolicy are
+// considered empty if the nodemask pointer is null, or if the nodemask size is
+// 0. We use a policy which accepts both empty and non-empty nodemasks
+// (MPOL_PREFERRED), a policy which requires a non-empty nodemask (MPOL_BIND),
+// and a policy which completely ignores the nodemask (MPOL_DEFAULT) to verify
+// argument checking around nodemasks.
+TEST(MempolicyTest, EmptyNodemaskOnSet) {
+ auto cleanup = ScopedMempolicy();
+
+ EXPECT_THAT(set_mempolicy(MPOL_DEFAULT, nullptr, 1), SyscallSucceeds());
+ EXPECT_THAT(set_mempolicy(MPOL_BIND, nullptr, 1),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(set_mempolicy(MPOL_PREFERRED, nullptr, 1), SyscallSucceeds());
+
+ uint64_t nodemask = 0x1;
+ EXPECT_THAT(set_mempolicy(MPOL_DEFAULT, &nodemask, 0),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(set_mempolicy(MPOL_BIND, &nodemask, 0),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(set_mempolicy(MPOL_PREFERRED, &nodemask, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(MempolicyTest, QueryAvailableNodes) {
+ uint64_t nodemask = 0;
+ ASSERT_THAT(
+ get_mempolicy(nullptr, &nodemask, sizeof(nodemask) * BITS_PER_BYTE,
+ nullptr, MPOL_F_MEMS_ALLOWED),
+ SyscallSucceeds());
+ // We can only be sure there is a single node if running on gvisor.
+ if (IsRunningOnGvisor()) {
+ EXPECT_EQ(0x1, nodemask);
+ }
+
+ // MPOL_F_ADDR and MPOL_F_NODE flags may not be combined with
+ // MPOL_F_MEMS_ALLLOWED.
+ for (auto flags :
+ {MPOL_F_MEMS_ALLOWED | MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED | MPOL_F_NODE,
+ MPOL_F_MEMS_ALLOWED | MPOL_F_ADDR | MPOL_F_NODE}) {
+ ASSERT_THAT(get_mempolicy(nullptr, &nodemask,
+ sizeof(nodemask) * BITS_PER_BYTE, nullptr, flags),
+ SyscallFailsWithErrno(EINVAL));
+ }
+}
+
+TEST(MempolicyTest, GetMempolicyQueryNodeForAddress) {
+ uint64_t dummy_stack_address;
+ auto dummy_heap_address = absl::make_unique<uint64_t>();
+ int mode;
+
+ for (auto ptr : {&dummy_stack_address, dummy_heap_address.get()}) {
+ mode = -1;
+ ASSERT_THAT(
+ get_mempolicy(&mode, nullptr, 0, ptr, MPOL_F_ADDR | MPOL_F_NODE),
+ SyscallSucceeds());
+ // If we're not running on gvisor, the address may be allocated on a
+ // different numa node.
+ if (IsRunningOnGvisor()) {
+ EXPECT_EQ(0, mode);
+ }
+ }
+
+ void* invalid_address = reinterpret_cast<void*>(-1);
+
+ // Invalid address.
+ ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, invalid_address,
+ MPOL_F_ADDR | MPOL_F_NODE),
+ SyscallFailsWithErrno(EFAULT));
+
+ // Invalid mode pointer.
+ ASSERT_THAT(get_mempolicy(reinterpret_cast<int*>(invalid_address), nullptr, 0,
+ &dummy_stack_address, MPOL_F_ADDR | MPOL_F_NODE),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST(MempolicyTest, GetMempolicyCanOmitPointers) {
+ int mode;
+ uint64_t nodemask;
+
+ // Omit nodemask pointer.
+ ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, nullptr, 0), SyscallSucceeds());
+ // Omit mode pointer.
+ ASSERT_THAT(get_mempolicy(nullptr, &nodemask,
+ sizeof(nodemask) * BITS_PER_BYTE, nullptr, 0),
+ SyscallSucceeds());
+ // Omit both pointers.
+ ASSERT_THAT(get_mempolicy(nullptr, nullptr, 0, nullptr, 0),
+ SyscallSucceeds());
+}
+
+TEST(MempolicyTest, GetMempolicyNextInterleaveNode) {
+ int mode;
+ // Policy for thread not yet set to MPOL_INTERLEAVE, can't query for
+ // the next node which will be used for allocation.
+ ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, nullptr, MPOL_F_NODE),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Set default policy for thread to MPOL_INTERLEAVE.
+ uint64_t nodemask = 0x1;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSetMempolicy(
+ MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * BITS_PER_BYTE));
+
+ mode = -1;
+ ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, nullptr, MPOL_F_NODE),
+ SyscallSucceeds());
+ EXPECT_EQ(0, mode);
+}
+
+TEST(MempolicyTest, Mbind) {
+ // Temporarily set the thread policy to MPOL_PREFERRED.
+ const auto cleanup_thread_policy =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSetMempolicy(MPOL_PREFERRED, nullptr, 0));
+
+ const auto mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS));
+
+ // vmas default to MPOL_DEFAULT irrespective of the thread policy (currently
+ // MPOL_PREFERRED).
+ int mode;
+ ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, mapping.ptr(), MPOL_F_ADDR),
+ SyscallSucceeds());
+ EXPECT_EQ(mode, MPOL_DEFAULT);
+
+ // Set MPOL_PREFERRED for the vma and read it back.
+ ASSERT_THAT(
+ mbind(mapping.ptr(), mapping.len(), MPOL_PREFERRED, nullptr, 0, 0),
+ SyscallSucceeds());
+ ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, mapping.ptr(), MPOL_F_ADDR),
+ SyscallSucceeds());
+ EXPECT_EQ(mode, MPOL_PREFERRED);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/mincore.cc b/test/syscalls/linux/mincore.cc
new file mode 100644
index 000000000..5c1240c89
--- /dev/null
+++ b/test/syscalls/linux/mincore.cc
@@ -0,0 +1,96 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+size_t CountSetLSBs(std::vector<unsigned char> const& vec) {
+ return std::count_if(begin(vec), end(vec),
+ [](unsigned char c) { return (c & 1) != 0; });
+}
+
+TEST(MincoreTest, DirtyAnonPagesAreResident) {
+ constexpr size_t kTestPageCount = 10;
+ auto const kTestMappingBytes = kTestPageCount * kPageSize;
+ auto m = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kTestMappingBytes, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ memset(m.ptr(), 0, m.len());
+
+ std::vector<unsigned char> vec(kTestPageCount, 0);
+ ASSERT_THAT(mincore(m.ptr(), kTestMappingBytes, vec.data()),
+ SyscallSucceeds());
+ EXPECT_EQ(kTestPageCount, CountSetLSBs(vec));
+}
+
+TEST(MincoreTest, UnalignedAddressFails) {
+ // Map and touch two pages, then try to mincore the second half of the first
+ // page + the first half of the second page. Both pages are mapped, but
+ // mincore should return EINVAL due to the misaligned start address.
+ constexpr size_t kTestPageCount = 2;
+ auto const kTestMappingBytes = kTestPageCount * kPageSize;
+ auto m = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kTestMappingBytes, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ memset(m.ptr(), 0, m.len());
+
+ std::vector<unsigned char> vec(kTestPageCount, 0);
+ EXPECT_THAT(mincore(reinterpret_cast<void*>(m.addr() + kPageSize / 2),
+ kPageSize, vec.data()),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(MincoreTest, UnalignedLengthSucceedsAndIsRoundedUp) {
+ // Map and touch two pages, then try to mincore the first page + the first
+ // half of the second page. mincore should silently round up the length to
+ // include both pages.
+ constexpr size_t kTestPageCount = 2;
+ auto const kTestMappingBytes = kTestPageCount * kPageSize;
+ auto m = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kTestMappingBytes, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ memset(m.ptr(), 0, m.len());
+
+ std::vector<unsigned char> vec(kTestPageCount, 0);
+ ASSERT_THAT(mincore(m.ptr(), kPageSize + kPageSize / 2, vec.data()),
+ SyscallSucceeds());
+ EXPECT_EQ(kTestPageCount, CountSetLSBs(vec));
+}
+
+TEST(MincoreTest, ZeroLengthSucceedsAndAllowsAnyVecBelowTaskSize) {
+ EXPECT_THAT(mincore(nullptr, 0, nullptr), SyscallSucceeds());
+}
+
+TEST(MincoreTest, InvalidLengthFails) {
+ EXPECT_THAT(mincore(nullptr, -1, nullptr), SyscallFailsWithErrno(ENOMEM));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/mkdir.cc b/test/syscalls/linux/mkdir.cc
new file mode 100644
index 000000000..4036a9275
--- /dev/null
+++ b/test/syscalls/linux/mkdir.cc
@@ -0,0 +1,88 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/capability_util.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/temp_umask.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class MkdirTest : public ::testing::Test {
+ protected:
+ // SetUp creates various configurations of files.
+ void SetUp() override { dirname_ = NewTempAbsPath(); }
+
+ // TearDown unlinks created files.
+ void TearDown() override {
+ EXPECT_THAT(rmdir(dirname_.c_str()), SyscallSucceeds());
+ }
+
+ std::string dirname_;
+};
+
+TEST_F(MkdirTest, CanCreateWritableDir) {
+ ASSERT_THAT(mkdir(dirname_.c_str(), 0777), SyscallSucceeds());
+ std::string filename = JoinPath(dirname_, "anything");
+ int fd;
+ ASSERT_THAT(fd = open(filename.c_str(), O_RDWR | O_CREAT, 0666),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ ASSERT_THAT(unlink(filename.c_str()), SyscallSucceeds());
+}
+
+TEST_F(MkdirTest, HonorsUmask) {
+ constexpr mode_t kMask = 0111;
+ TempUmask mask(kMask);
+ ASSERT_THAT(mkdir(dirname_.c_str(), 0777), SyscallSucceeds());
+ struct stat statbuf;
+ ASSERT_THAT(stat(dirname_.c_str(), &statbuf), SyscallSucceeds());
+ EXPECT_EQ(0777 & ~kMask, statbuf.st_mode & 0777);
+}
+
+TEST_F(MkdirTest, HonorsUmask2) {
+ constexpr mode_t kMask = 0142;
+ TempUmask mask(kMask);
+ ASSERT_THAT(mkdir(dirname_.c_str(), 0777), SyscallSucceeds());
+ struct stat statbuf;
+ ASSERT_THAT(stat(dirname_.c_str(), &statbuf), SyscallSucceeds());
+ EXPECT_EQ(0777 & ~kMask, statbuf.st_mode & 0777);
+}
+
+TEST_F(MkdirTest, FailsOnDirWithoutWritePerms) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ ASSERT_THAT(mkdir(dirname_.c_str(), 0555), SyscallSucceeds());
+ auto dir = JoinPath(dirname_.c_str(), "foo");
+ EXPECT_THAT(mkdir(dir.c_str(), 0777), SyscallFailsWithErrno(EACCES));
+ EXPECT_THAT(open(JoinPath(dirname_, "file").c_str(), O_RDWR | O_CREAT, 0666),
+ SyscallFailsWithErrno(EACCES));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/mknod.cc b/test/syscalls/linux/mknod.cc
new file mode 100644
index 000000000..05dfb375a
--- /dev/null
+++ b/test/syscalls/linux/mknod.cc
@@ -0,0 +1,190 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(MknodTest, RegularFile) {
+ const std::string node0 = NewTempAbsPath();
+ EXPECT_THAT(mknod(node0.c_str(), S_IFREG, 0), SyscallSucceeds());
+
+ const std::string node1 = NewTempAbsPath();
+ EXPECT_THAT(mknod(node1.c_str(), 0, 0), SyscallSucceeds());
+}
+
+TEST(MknodTest, RegularFilePermissions) {
+ const std::string node = NewTempAbsPath();
+ mode_t newUmask = 0077;
+ umask(newUmask);
+
+ // Attempt to open file with mode 0777. Not specifying file type should create
+ // a regualar file.
+ mode_t perms = S_IRWXU | S_IRWXG | S_IRWXO;
+ EXPECT_THAT(mknod(node.c_str(), perms, 0), SyscallSucceeds());
+
+ // In the absence of a default ACL, the permissions of the created node are
+ // (mode & ~umask). -- mknod(2)
+ mode_t wantPerms = perms & ~newUmask;
+ struct stat st;
+ ASSERT_THAT(stat(node.c_str(), &st), SyscallSucceeds());
+ ASSERT_EQ(st.st_mode & 0777, wantPerms);
+
+ // "Zero file type is equivalent to type S_IFREG." - mknod(2)
+ ASSERT_EQ(st.st_mode & S_IFMT, S_IFREG);
+}
+
+TEST(MknodTest, MknodAtFIFO) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const std::string fifo_relpath = NewTempRelPath();
+ const std::string fifo = JoinPath(dir.path(), fifo_relpath);
+
+ const FileDescriptor dirfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path().c_str(), O_RDONLY));
+ ASSERT_THAT(mknodat(dirfd.get(), fifo_relpath.c_str(), S_IFIFO | S_IRUSR, 0),
+ SyscallSucceeds());
+
+ struct stat st;
+ ASSERT_THAT(stat(fifo.c_str(), &st), SyscallSucceeds());
+ EXPECT_TRUE(S_ISFIFO(st.st_mode));
+}
+
+TEST(MknodTest, MknodOnExistingPathFails) {
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const TempPath slink = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), file.path()));
+
+ EXPECT_THAT(mknod(file.path().c_str(), S_IFREG, 0),
+ SyscallFailsWithErrno(EEXIST));
+ EXPECT_THAT(mknod(file.path().c_str(), S_IFIFO, 0),
+ SyscallFailsWithErrno(EEXIST));
+ EXPECT_THAT(mknod(slink.path().c_str(), S_IFREG, 0),
+ SyscallFailsWithErrno(EEXIST));
+ EXPECT_THAT(mknod(slink.path().c_str(), S_IFIFO, 0),
+ SyscallFailsWithErrno(EEXIST));
+}
+
+TEST(MknodTest, UnimplementedTypesReturnError) {
+ const std::string path = NewTempAbsPath();
+
+ if (IsRunningWithVFS1()) {
+ ASSERT_THAT(mknod(path.c_str(), S_IFSOCK, 0),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+ }
+ // These will fail on linux as well since we don't have CAP_MKNOD.
+ ASSERT_THAT(mknod(path.c_str(), S_IFCHR, 0), SyscallFailsWithErrno(EPERM));
+ ASSERT_THAT(mknod(path.c_str(), S_IFBLK, 0), SyscallFailsWithErrno(EPERM));
+}
+
+TEST(MknodTest, Fifo) {
+ const std::string fifo = NewTempAbsPath();
+ ASSERT_THAT(mknod(fifo.c_str(), S_IFIFO | S_IRUSR | S_IWUSR, 0),
+ SyscallSucceeds());
+
+ struct stat st;
+ ASSERT_THAT(stat(fifo.c_str(), &st), SyscallSucceeds());
+ EXPECT_TRUE(S_ISFIFO(st.st_mode));
+
+ std::string msg = "some std::string";
+ std::vector<char> buf(512);
+
+ // Read-end of the pipe.
+ ScopedThread t([&fifo, &buf, &msg]() {
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(fifo.c_str(), O_RDONLY));
+ EXPECT_THAT(ReadFd(fd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(msg.length()));
+ EXPECT_EQ(msg, std::string(buf.data()));
+ });
+
+ // Write-end of the pipe.
+ FileDescriptor wfd = ASSERT_NO_ERRNO_AND_VALUE(Open(fifo.c_str(), O_WRONLY));
+ EXPECT_THAT(WriteFd(wfd.get(), msg.c_str(), msg.length()),
+ SyscallSucceedsWithValue(msg.length()));
+}
+
+TEST(MknodTest, FifoOtrunc) {
+ const std::string fifo = NewTempAbsPath();
+ ASSERT_THAT(mknod(fifo.c_str(), S_IFIFO | S_IRUSR | S_IWUSR, 0),
+ SyscallSucceeds());
+
+ struct stat st = {};
+ ASSERT_THAT(stat(fifo.c_str(), &st), SyscallSucceeds());
+ EXPECT_TRUE(S_ISFIFO(st.st_mode));
+
+ std::string msg = "some std::string";
+ std::vector<char> buf(512);
+ // Read-end of the pipe.
+ ScopedThread t([&fifo, &buf, &msg]() {
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(fifo.c_str(), O_RDONLY));
+ EXPECT_THAT(ReadFd(fd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(msg.length()));
+ EXPECT_EQ(msg, std::string(buf.data()));
+ });
+
+ // Write-end of the pipe.
+ FileDescriptor wfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(fifo.c_str(), O_WRONLY | O_TRUNC));
+ EXPECT_THAT(WriteFd(wfd.get(), msg.c_str(), msg.length()),
+ SyscallSucceedsWithValue(msg.length()));
+}
+
+TEST(MknodTest, FifoTruncNoOp) {
+ const std::string fifo = NewTempAbsPath();
+ ASSERT_THAT(mknod(fifo.c_str(), S_IFIFO | S_IRUSR | S_IWUSR, 0),
+ SyscallSucceeds());
+
+ EXPECT_THAT(truncate(fifo.c_str(), 0), SyscallFailsWithErrno(EINVAL));
+
+ struct stat st = {};
+ ASSERT_THAT(stat(fifo.c_str(), &st), SyscallSucceeds());
+ EXPECT_TRUE(S_ISFIFO(st.st_mode));
+
+ std::string msg = "some std::string";
+ std::vector<char> buf(512);
+ // Read-end of the pipe.
+ ScopedThread t([&fifo, &buf, &msg]() {
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(fifo.c_str(), O_RDONLY));
+ EXPECT_THAT(ReadFd(fd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(msg.length()));
+ EXPECT_EQ(msg, std::string(buf.data()));
+ });
+
+ FileDescriptor wfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(fifo.c_str(), O_WRONLY | O_TRUNC));
+ EXPECT_THAT(ftruncate(wfd.get(), 0), SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(WriteFd(wfd.get(), msg.c_str(), msg.length()),
+ SyscallSucceedsWithValue(msg.length()));
+ EXPECT_THAT(ftruncate(wfd.get(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/mlock.cc b/test/syscalls/linux/mlock.cc
new file mode 100644
index 000000000..78ac96bed
--- /dev/null
+++ b/test/syscalls/linux/mlock.cc
@@ -0,0 +1,332 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <cerrno>
+#include <cstring>
+
+#include "gmock/gmock.h"
+#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/memory_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/rlimit_util.h"
+#include "test/util/test_util.h"
+
+using ::testing::_;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+PosixErrorOr<bool> CanMlock() {
+ struct rlimit rlim;
+ if (getrlimit(RLIMIT_MEMLOCK, &rlim) < 0) {
+ return PosixError(errno, "getrlimit(RLIMIT_MEMLOCK)");
+ }
+ if (rlim.rlim_cur != 0) {
+ return true;
+ }
+ return HaveCapability(CAP_IPC_LOCK);
+}
+
+// Returns true if the page containing addr is mlocked.
+bool IsPageMlocked(uintptr_t addr) {
+ // This relies on msync(MS_INVALIDATE) interacting correctly with mlocked
+ // pages, which is tested for by the MsyncInvalidate case below.
+ int const rv = msync(reinterpret_cast<void*>(addr & ~(kPageSize - 1)),
+ kPageSize, MS_ASYNC | MS_INVALIDATE);
+ if (rv == 0) {
+ return false;
+ }
+ // This uses TEST_PCHECK_MSG since it's used in subprocesses.
+ TEST_PCHECK_MSG(errno == EBUSY, "msync failed with unexpected errno");
+ return true;
+}
+
+TEST(MlockTest, Basic) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+ ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
+ EXPECT_TRUE(IsPageMlocked(mapping.addr()));
+}
+
+TEST(MlockTest, ProtNone) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto const mapping =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE));
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+ ASSERT_THAT(mlock(mapping.ptr(), mapping.len()),
+ SyscallFailsWithErrno(ENOMEM));
+ // ENOMEM is returned because mlock can't populate the page, but it's still
+ // considered locked.
+ EXPECT_TRUE(IsPageMlocked(mapping.addr()));
+}
+
+TEST(MlockTest, MadviseDontneed) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
+ EXPECT_THAT(madvise(mapping.ptr(), mapping.len(), MADV_DONTNEED),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(MlockTest, MsyncInvalidate) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
+ EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_ASYNC | MS_INVALIDATE),
+ SyscallFailsWithErrno(EBUSY));
+ EXPECT_THAT(msync(mapping.ptr(), mapping.len(), MS_SYNC | MS_INVALIDATE),
+ SyscallFailsWithErrno(EBUSY));
+}
+
+TEST(MlockTest, Fork) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+ ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
+ EXPECT_TRUE(IsPageMlocked(mapping.addr()));
+ EXPECT_THAT(
+ InForkedProcess([&] { TEST_CHECK(!IsPageMlocked(mapping.addr())); }),
+ IsPosixErrorOkAndHolds(0));
+}
+
+TEST(MlockTest, RlimitMemlockZero) {
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) {
+ ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false));
+ }
+ Cleanup reset_rlimit =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+ ASSERT_THAT(mlock(mapping.ptr(), mapping.len()),
+ SyscallFailsWithErrno(EPERM));
+}
+
+TEST(MlockTest, RlimitMemlockInsufficient) {
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) {
+ ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false));
+ }
+ Cleanup reset_rlimit =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+ ASSERT_THAT(mlock(mapping.ptr(), mapping.len()),
+ SyscallFailsWithErrno(ENOMEM));
+}
+
+TEST(MunlockTest, Basic) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+ ASSERT_THAT(mlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
+ EXPECT_TRUE(IsPageMlocked(mapping.addr()));
+ ASSERT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+}
+
+TEST(MunlockTest, NotLocked) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+ EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+}
+
+// There is currently no test for mlockall(MCL_CURRENT) because the default
+// RLIMIT_MEMLOCK of 64 KB is insufficient to actually invoke
+// mlockall(MCL_CURRENT).
+
+TEST(MlockallTest, Future) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+
+ // Run this test in a separate (single-threaded) subprocess to ensure that a
+ // background thread doesn't try to mmap a large amount of memory, fail due
+ // to hitting RLIMIT_MEMLOCK, and explode the process violently.
+ auto const do_test = [] {
+ auto const mapping =
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE).ValueOrDie();
+ TEST_CHECK(!IsPageMlocked(mapping.addr()));
+ TEST_PCHECK(mlockall(MCL_FUTURE) == 0);
+ // Ensure that mlockall(MCL_FUTURE) is turned off before the end of the
+ // test, as otherwise mmaps may fail unexpectedly.
+ Cleanup do_munlockall([] { TEST_PCHECK(munlockall() == 0); });
+ auto const mapping2 =
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE).ValueOrDie();
+ TEST_CHECK(IsPageMlocked(mapping2.addr()));
+ // Fire munlockall() and check that it disables mlockall(MCL_FUTURE).
+ do_munlockall.Release()();
+ auto const mapping3 =
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE).ValueOrDie();
+ TEST_CHECK(!IsPageMlocked(mapping2.addr()));
+ };
+ EXPECT_THAT(InForkedProcess(do_test), IsPosixErrorOkAndHolds(0));
+}
+
+TEST(MunlockallTest, Basic) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED));
+ EXPECT_TRUE(IsPageMlocked(mapping.addr()));
+ ASSERT_THAT(munlockall(), SyscallSucceeds());
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+}
+
+#ifndef SYS_mlock2
+#if defined(__x86_64__)
+#define SYS_mlock2 325
+#elif defined(__aarch64__)
+#define SYS_mlock2 284
+#endif
+#endif
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 0x01 // Linux: include/uapi/asm-generic/mman-common.h
+#endif
+
+#ifdef SYS_mlock2
+
+int mlock2(void const* addr, size_t len, int flags) {
+ return syscall(SYS_mlock2, addr, len, flags);
+}
+
+TEST(Mlock2Test, NoFlags) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+ ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), 0), SyscallSucceeds());
+ EXPECT_TRUE(IsPageMlocked(mapping.addr()));
+}
+
+TEST(Mlock2Test, MlockOnfault) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+ ASSERT_THAT(mlock2(mapping.ptr(), mapping.len(), MLOCK_ONFAULT),
+ SyscallSucceeds());
+ EXPECT_TRUE(IsPageMlocked(mapping.addr()));
+}
+
+TEST(Mlock2Test, UnknownFlags) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+ EXPECT_THAT(mlock2(mapping.ptr(), mapping.len(), ~0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+#endif // defined(SYS_mlock2)
+
+TEST(MapLockedTest, Basic) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED));
+ EXPECT_TRUE(IsPageMlocked(mapping.addr()));
+ EXPECT_THAT(munlock(mapping.ptr(), mapping.len()), SyscallSucceeds());
+ EXPECT_FALSE(IsPageMlocked(mapping.addr()));
+}
+
+TEST(MapLockedTest, RlimitMemlockZero) {
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) {
+ ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false));
+ }
+ Cleanup reset_rlimit =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0));
+ EXPECT_THAT(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED),
+ PosixErrorIs(EPERM, _));
+}
+
+TEST(MapLockedTest, RlimitMemlockInsufficient) {
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) {
+ ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false));
+ }
+ Cleanup reset_rlimit =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, kPageSize));
+ EXPECT_THAT(
+ MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED),
+ PosixErrorIs(EAGAIN, _));
+}
+
+TEST(MremapLockedTest, Basic) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED));
+ EXPECT_TRUE(IsPageMlocked(mapping.addr()));
+
+ void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(),
+ MREMAP_MAYMOVE, nullptr);
+ if (addr == MAP_FAILED) {
+ FAIL() << "mremap failed: " << errno << " (" << strerror(errno) << ")";
+ }
+ mapping.release();
+ mapping.reset(addr, 2 * mapping.len());
+ EXPECT_TRUE(IsPageMlocked(reinterpret_cast<uintptr_t>(addr)));
+}
+
+TEST(MremapLockedTest, RlimitMemlockZero) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED));
+ EXPECT_TRUE(IsPageMlocked(mapping.addr()));
+
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) {
+ ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false));
+ }
+ Cleanup reset_rlimit =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_MEMLOCK, 0));
+ void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(),
+ MREMAP_MAYMOVE, nullptr);
+ EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN)
+ << "addr = " << addr << ", errno = " << errno;
+}
+
+TEST(MremapLockedTest, RlimitMemlockInsufficient) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanMlock()));
+ auto mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_LOCKED));
+ EXPECT_TRUE(IsPageMlocked(mapping.addr()));
+
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_IPC_LOCK))) {
+ ASSERT_NO_ERRNO(SetCapability(CAP_IPC_LOCK, false));
+ }
+ Cleanup reset_rlimit = ASSERT_NO_ERRNO_AND_VALUE(
+ ScopedSetSoftRlimit(RLIMIT_MEMLOCK, mapping.len()));
+ void* addr = mremap(mapping.ptr(), mapping.len(), 2 * mapping.len(),
+ MREMAP_MAYMOVE, nullptr);
+ EXPECT_TRUE(addr == MAP_FAILED && errno == EAGAIN)
+ << "addr = " << addr << ", errno = " << errno;
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/mmap.cc b/test/syscalls/linux/mmap.cc
new file mode 100644
index 000000000..6d3227ab6
--- /dev/null
+++ b/test/syscalls/linux/mmap.cc
@@ -0,0 +1,1676 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/magic.h>
+#include <linux/unistd.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/statfs.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/escaping.h"
+#include "absl/strings/str_split.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/memory_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+using ::testing::Gt;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+PosixErrorOr<int64_t> VirtualMemorySize() {
+ ASSIGN_OR_RETURN_ERRNO(auto contents, GetContents("/proc/self/statm"));
+ std::vector<std::string> parts = absl::StrSplit(contents, ' ');
+ if (parts.empty()) {
+ return PosixError(EINVAL, "Unable to parse /proc/self/statm");
+ }
+ ASSIGN_OR_RETURN_ERRNO(auto pages, Atoi<int64_t>(parts[0]));
+ return pages * getpagesize();
+}
+
+class MMapTest : public ::testing::Test {
+ protected:
+ // Unmap mapping, if one was made.
+ void TearDown() override {
+ if (addr_) {
+ EXPECT_THAT(Unmap(), SyscallSucceeds());
+ }
+ }
+
+ // Remembers mapping, so it can be automatically unmapped.
+ uintptr_t Map(uintptr_t addr, size_t length, int prot, int flags, int fd,
+ off_t offset) {
+ void* ret =
+ mmap(reinterpret_cast<void*>(addr), length, prot, flags, fd, offset);
+
+ if (ret != MAP_FAILED) {
+ addr_ = ret;
+ length_ = length;
+ }
+
+ return reinterpret_cast<uintptr_t>(ret);
+ }
+
+ // Unmap previous mapping
+ int Unmap() {
+ if (!addr_) {
+ return -1;
+ }
+
+ int ret = munmap(addr_, length_);
+
+ addr_ = nullptr;
+ length_ = 0;
+
+ return ret;
+ }
+
+ // Msync the mapping.
+ int Msync() { return msync(addr_, length_, MS_SYNC); }
+
+ // Mlock the mapping.
+ int Mlock() { return mlock(addr_, length_); }
+
+ // Munlock the mapping.
+ int Munlock() { return munlock(addr_, length_); }
+
+ int Protect(uintptr_t addr, size_t length, int prot) {
+ return mprotect(reinterpret_cast<void*>(addr), length, prot);
+ }
+
+ void* addr_ = nullptr;
+ size_t length_ = 0;
+};
+
+// Matches if arg contains the same contents as string str.
+MATCHER_P(EqualsMemory, str, "") {
+ if (0 == memcmp(arg, str.c_str(), str.size())) {
+ return true;
+ }
+
+ *result_listener << "Memory did not match. Got:\n"
+ << absl::BytesToHexString(
+ std::string(static_cast<char*>(arg), str.size()))
+ << "Want:\n"
+ << absl::BytesToHexString(str);
+ return false;
+}
+
+// We can't map pipes, but for different reasons.
+TEST_F(MMapTest, MapPipe) {
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ EXPECT_THAT(Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fds[0], 0),
+ SyscallFailsWithErrno(ENODEV));
+ EXPECT_THAT(Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fds[1], 0),
+ SyscallFailsWithErrno(EACCES));
+ ASSERT_THAT(close(fds[0]), SyscallSucceeds());
+ ASSERT_THAT(close(fds[1]), SyscallSucceeds());
+}
+
+// It's very common to mmap /dev/zero because anonymous mappings aren't part
+// of POSIX although they are widely supported. So a zero initialized memory
+// region would actually come from a "file backed" /dev/zero mapping.
+TEST_F(MMapTest, MapDevZeroShared) {
+ // This test will verify that we're able to map a page backed by /dev/zero
+ // as MAP_SHARED.
+ const FileDescriptor dev_zero =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR));
+
+ // Test that we can create a RW SHARED mapping of /dev/zero.
+ ASSERT_THAT(
+ Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, dev_zero.get(), 0),
+ SyscallSucceeds());
+}
+
+TEST_F(MMapTest, MapDevZeroPrivate) {
+ // This test will verify that we're able to map a page backed by /dev/zero
+ // as MAP_PRIVATE.
+ const FileDescriptor dev_zero =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR));
+
+ // Test that we can create a RW SHARED mapping of /dev/zero.
+ ASSERT_THAT(
+ Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, dev_zero.get(), 0),
+ SyscallSucceeds());
+}
+
+TEST_F(MMapTest, MapDevZeroNoPersistence) {
+ // This test will verify that two independent mappings of /dev/zero do not
+ // appear to reference the same "backed file."
+
+ const FileDescriptor dev_zero1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR));
+ const FileDescriptor dev_zero2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR));
+
+ ASSERT_THAT(
+ Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, dev_zero1.get(), 0),
+ SyscallSucceeds());
+
+ // Create a second mapping via the second /dev/zero fd.
+ void* psec_map = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ dev_zero2.get(), 0);
+ ASSERT_THAT(reinterpret_cast<intptr_t>(psec_map), SyscallSucceeds());
+
+ // Always unmap.
+ auto cleanup_psec_map = Cleanup(
+ [&] { EXPECT_THAT(munmap(psec_map, kPageSize), SyscallSucceeds()); });
+
+ // Verify that we have independently addressed pages.
+ ASSERT_NE(psec_map, addr_);
+
+ std::string buf_zero(kPageSize, 0x00);
+ std::string buf_ones(kPageSize, 0xFF);
+
+ // Verify the first is actually all zeros after mmap.
+ EXPECT_THAT(addr_, EqualsMemory(buf_zero));
+
+ // Let's fill in the first mapping with 0xFF.
+ memcpy(addr_, buf_ones.data(), kPageSize);
+
+ // Verify that the memcpy actually stuck in the page.
+ EXPECT_THAT(addr_, EqualsMemory(buf_ones));
+
+ // Verify that it didn't affect the second page which should be all zeros.
+ EXPECT_THAT(psec_map, EqualsMemory(buf_zero));
+}
+
+TEST_F(MMapTest, MapDevZeroSharedMultiplePages) {
+ // This will test that we're able to map /dev/zero over multiple pages.
+ const FileDescriptor dev_zero =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR));
+
+ // Test that we can create a RW SHARED mapping of /dev/zero.
+ ASSERT_THAT(Map(0, kPageSize * 2, PROT_READ | PROT_WRITE, MAP_PRIVATE,
+ dev_zero.get(), 0),
+ SyscallSucceeds());
+
+ std::string buf_zero(kPageSize * 2, 0x00);
+ std::string buf_ones(kPageSize * 2, 0xFF);
+
+ // Verify the two pages are actually all zeros after mmap.
+ EXPECT_THAT(addr_, EqualsMemory(buf_zero));
+
+ // Fill out the pages with all ones.
+ memcpy(addr_, buf_ones.data(), kPageSize * 2);
+
+ // Verify that the memcpy actually stuck in the pages.
+ EXPECT_THAT(addr_, EqualsMemory(buf_ones));
+}
+
+TEST_F(MMapTest, MapDevZeroSharedFdNoPersistence) {
+ // This test will verify that two independent mappings of /dev/zero do not
+ // appear to reference the same "backed file" even when mapped from the
+ // same initial fd.
+ const FileDescriptor dev_zero =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR));
+
+ ASSERT_THAT(
+ Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, dev_zero.get(), 0),
+ SyscallSucceeds());
+
+ // Create a second mapping via the same fd.
+ void* psec_map = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ dev_zero.get(), 0);
+ ASSERT_THAT(reinterpret_cast<int64_t>(psec_map), SyscallSucceeds());
+
+ // Always unmap.
+ auto cleanup_psec_map = Cleanup(
+ [&] { ASSERT_THAT(munmap(psec_map, kPageSize), SyscallSucceeds()); });
+
+ // Verify that we have independently addressed pages.
+ ASSERT_NE(psec_map, addr_);
+
+ std::string buf_zero(kPageSize, 0x00);
+ std::string buf_ones(kPageSize, 0xFF);
+
+ // Verify the first is actually all zeros after mmap.
+ EXPECT_THAT(addr_, EqualsMemory(buf_zero));
+
+ // Let's fill in the first mapping with 0xFF.
+ memcpy(addr_, buf_ones.data(), kPageSize);
+
+ // Verify that the memcpy actually stuck in the page.
+ EXPECT_THAT(addr_, EqualsMemory(buf_ones));
+
+ // Verify that it didn't affect the second page which should be all zeros.
+ EXPECT_THAT(psec_map, EqualsMemory(buf_zero));
+}
+
+TEST_F(MMapTest, MapDevZeroSegfaultAfterUnmap) {
+ SetupGvisorDeathTest();
+
+ // This test will verify that we're able to map a page backed by /dev/zero
+ // as MAP_SHARED and after it's unmapped any access results in a SIGSEGV.
+ // This test is redundant but given the special nature of /dev/zero mappings
+ // it doesn't hurt.
+ const FileDescriptor dev_zero =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR));
+
+ const auto rest = [&] {
+ // Test that we can create a RW SHARED mapping of /dev/zero.
+ TEST_PCHECK(Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ dev_zero.get(),
+ 0) != reinterpret_cast<uintptr_t>(MAP_FAILED));
+
+ // Confirm that accesses after the unmap result in a SIGSEGV.
+ //
+ // N.B. We depend on this process being single-threaded to ensure there
+ // can't be another mmap to map addr before the dereference below.
+ void* addr_saved = addr_; // Unmap resets addr_.
+ TEST_PCHECK(Unmap() == 0);
+ *reinterpret_cast<volatile int*>(addr_saved) = 0xFF;
+ };
+
+ EXPECT_THAT(InForkedProcess(rest),
+ IsPosixErrorOkAndHolds(W_EXITCODE(0, SIGSEGV)));
+}
+
+TEST_F(MMapTest, MapDevZeroUnaligned) {
+ const FileDescriptor dev_zero =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDWR));
+ const size_t size = kPageSize + kPageSize / 2;
+ const std::string buf_zero(size, 0x00);
+
+ ASSERT_THAT(
+ Map(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, dev_zero.get(), 0),
+ SyscallSucceeds());
+ EXPECT_THAT(addr_, EqualsMemory(buf_zero));
+ ASSERT_THAT(Unmap(), SyscallSucceeds());
+
+ ASSERT_THAT(
+ Map(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, dev_zero.get(), 0),
+ SyscallSucceeds());
+ EXPECT_THAT(addr_, EqualsMemory(buf_zero));
+}
+
+// We can't map _some_ character devices.
+TEST_F(MMapTest, MapCharDevice) {
+ const FileDescriptor cdevfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/random", 0, 0));
+ EXPECT_THAT(Map(0, kPageSize, PROT_READ, MAP_PRIVATE, cdevfd.get(), 0),
+ SyscallFailsWithErrno(ENODEV));
+}
+
+// We can't map directories.
+TEST_F(MMapTest, MapDirectory) {
+ const FileDescriptor dirfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(GetAbsoluteTestTmpdir(), 0, 0));
+ EXPECT_THAT(Map(0, kPageSize, PROT_READ, MAP_PRIVATE, dirfd.get(), 0),
+ SyscallFailsWithErrno(ENODEV));
+}
+
+// We can map *something*
+TEST_F(MMapTest, MapAnything) {
+ EXPECT_THAT(Map(0, kPageSize, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceedsWithValue(Gt(0)));
+}
+
+// Map length < PageSize allowed
+TEST_F(MMapTest, SmallMap) {
+ EXPECT_THAT(Map(0, 128, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+}
+
+// Hint address doesn't break anything.
+// Note: there is no requirement we actually get the hint address
+TEST_F(MMapTest, HintAddress) {
+ EXPECT_THAT(
+ Map(0x30000000, kPageSize, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+}
+
+// MAP_FIXED gives us exactly the requested address
+TEST_F(MMapTest, MapFixed) {
+ EXPECT_THAT(Map(0x30000000, kPageSize, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0),
+ SyscallSucceedsWithValue(0x30000000));
+}
+
+// 64-bit addresses work too
+#if defined(__x86_64__) || defined(__aarch64__)
+TEST_F(MMapTest, MapFixed64) {
+ EXPECT_THAT(Map(0x300000000000, kPageSize, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0),
+ SyscallSucceedsWithValue(0x300000000000));
+}
+#endif
+
+// MAP_STACK allowed.
+// There isn't a good way to verify it did anything.
+TEST_F(MMapTest, MapStack) {
+ EXPECT_THAT(Map(0, kPageSize, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0),
+ SyscallSucceeds());
+}
+
+// MAP_LOCKED allowed.
+// There isn't a good way to verify it did anything.
+TEST_F(MMapTest, MapLocked) {
+ EXPECT_THAT(Map(0, kPageSize, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0),
+ SyscallSucceeds());
+}
+
+// MAP_PRIVATE or MAP_SHARED must be passed
+TEST_F(MMapTest, NotPrivateOrShared) {
+ EXPECT_THAT(Map(0, kPageSize, PROT_NONE, MAP_ANONYMOUS, -1, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Only one of MAP_PRIVATE or MAP_SHARED may be passed
+TEST_F(MMapTest, PrivateAndShared) {
+ EXPECT_THAT(Map(0, kPageSize, PROT_NONE,
+ MAP_PRIVATE | MAP_SHARED | MAP_ANONYMOUS, -1, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(MMapTest, FixedAlignment) {
+ // Addr must be page aligned (MAP_FIXED)
+ EXPECT_THAT(Map(0x30000001, kPageSize, PROT_NONE,
+ MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Non-MAP_FIXED address does not need to be page aligned
+TEST_F(MMapTest, NonFixedAlignment) {
+ EXPECT_THAT(
+ Map(0x30000001, kPageSize, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+}
+
+// Length = 0 results in EINVAL.
+TEST_F(MMapTest, InvalidLength) {
+ EXPECT_THAT(Map(0, 0, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Bad fd not allowed.
+TEST_F(MMapTest, BadFd) {
+ EXPECT_THAT(Map(0, kPageSize, PROT_NONE, MAP_PRIVATE, 999, 0),
+ SyscallFailsWithErrno(EBADF));
+}
+
+// Mappings are writable.
+TEST_F(MMapTest, ProtWrite) {
+ uint64_t addr;
+ constexpr uint8_t kFirstWord[] = {42, 42, 42, 42};
+
+ EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+
+ // This shouldn't cause a SIGSEGV.
+ memset(reinterpret_cast<void*>(addr), 42, kPageSize);
+
+ // The written data should actually be there.
+ EXPECT_EQ(
+ 0, memcmp(reinterpret_cast<void*>(addr), kFirstWord, sizeof(kFirstWord)));
+}
+
+// "Write-only" mappings are writable *and* readable.
+TEST_F(MMapTest, ProtWriteOnly) {
+ uint64_t addr;
+ constexpr uint8_t kFirstWord[] = {42, 42, 42, 42};
+
+ EXPECT_THAT(
+ addr = Map(0, kPageSize, PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+
+ // This shouldn't cause a SIGSEGV.
+ memset(reinterpret_cast<void*>(addr), 42, kPageSize);
+
+ // The written data should actually be there.
+ EXPECT_EQ(
+ 0, memcmp(reinterpret_cast<void*>(addr), kFirstWord, sizeof(kFirstWord)));
+}
+
+// "Write-only" mappings are readable.
+//
+// This is distinct from above to ensure the page is accessible even if the
+// initial fault is a write fault.
+TEST_F(MMapTest, ProtWriteOnlyReadable) {
+ uint64_t addr;
+ constexpr uint64_t kFirstWord = 0;
+
+ EXPECT_THAT(
+ addr = Map(0, kPageSize, PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), &kFirstWord,
+ sizeof(kFirstWord)));
+}
+
+// Mappings are writable after mprotect from PROT_NONE to PROT_READ|PROT_WRITE.
+TEST_F(MMapTest, ProtectProtWrite) {
+ uint64_t addr;
+ constexpr uint8_t kFirstWord[] = {42, 42, 42, 42};
+
+ EXPECT_THAT(
+ addr = Map(0, kPageSize, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+
+ ASSERT_THAT(Protect(addr, kPageSize, PROT_READ | PROT_WRITE),
+ SyscallSucceeds());
+
+ // This shouldn't cause a SIGSEGV.
+ memset(reinterpret_cast<void*>(addr), 42, kPageSize);
+
+ // The written data should actually be there.
+ EXPECT_EQ(
+ 0, memcmp(reinterpret_cast<void*>(addr), kFirstWord, sizeof(kFirstWord)));
+}
+
+// SIGSEGV raised when reading PROT_NONE memory
+TEST_F(MMapTest, ProtNoneDeath) {
+ SetupGvisorDeathTest();
+
+ uintptr_t addr;
+
+ ASSERT_THAT(
+ addr = Map(0, kPageSize, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+
+ EXPECT_EXIT(*reinterpret_cast<volatile int*>(addr),
+ ::testing::KilledBySignal(SIGSEGV), "");
+}
+
+// SIGSEGV raised when writing PROT_READ only memory
+TEST_F(MMapTest, ReadOnlyDeath) {
+ SetupGvisorDeathTest();
+
+ uintptr_t addr;
+
+ ASSERT_THAT(
+ addr = Map(0, kPageSize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+
+ EXPECT_EXIT(*reinterpret_cast<volatile int*>(addr) = 42,
+ ::testing::KilledBySignal(SIGSEGV), "");
+}
+
+// Writable mapping mprotect'd to read-only should not be writable.
+TEST_F(MMapTest, MprotectReadOnlyDeath) {
+ SetupGvisorDeathTest();
+
+ uintptr_t addr;
+
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+
+ volatile int* val = reinterpret_cast<int*>(addr);
+
+ // Copy to ensure page is mapped in.
+ *val = 42;
+
+ ASSERT_THAT(Protect(addr, kPageSize, PROT_READ), SyscallSucceeds());
+
+ // Now it shouldn't be writable.
+ EXPECT_EXIT(*val = 0, ::testing::KilledBySignal(SIGSEGV), "");
+}
+
+// Verify that calling mprotect an address that's not page aligned fails.
+TEST_F(MMapTest, MprotectNotPageAligned) {
+ uintptr_t addr;
+
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+ ASSERT_THAT(Protect(addr + 1, kPageSize - 1, PROT_READ),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Verify that calling mprotect with an absurdly huge length fails.
+TEST_F(MMapTest, MprotectHugeLength) {
+ uintptr_t addr;
+
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+ ASSERT_THAT(Protect(addr, static_cast<size_t>(-1), PROT_READ),
+ SyscallFailsWithErrno(ENOMEM));
+}
+
+#if defined(__x86_64__) || defined(__i386__)
+// This code is equivalent in 32 and 64-bit mode
+const uint8_t machine_code[] = {
+ 0xb8, 0x2a, 0x00, 0x00, 0x00, // movl $42, %eax
+ 0xc3, // retq
+};
+#elif defined(__aarch64__)
+const uint8_t machine_code[] = {
+ 0x40, 0x05, 0x80, 0x52, // mov w0, #42
+ 0xc0, 0x03, 0x5f, 0xd6, // ret
+};
+#endif
+
+// PROT_EXEC allows code execution
+TEST_F(MMapTest, ProtExec) {
+ uintptr_t addr;
+ uint32_t (*func)(void);
+
+ EXPECT_THAT(addr = Map(0, kPageSize, PROT_EXEC | PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+
+ memcpy(reinterpret_cast<void*>(addr), machine_code, sizeof(machine_code));
+
+ func = reinterpret_cast<uint32_t (*)(void)>(addr);
+
+ EXPECT_EQ(42, func());
+}
+
+// No PROT_EXEC disallows code execution
+TEST_F(MMapTest, NoProtExecDeath) {
+ SetupGvisorDeathTest();
+
+ uintptr_t addr;
+ uint32_t (*func)(void);
+
+ EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+
+ memcpy(reinterpret_cast<void*>(addr), machine_code, sizeof(machine_code));
+
+ func = reinterpret_cast<uint32_t (*)(void)>(addr);
+
+ EXPECT_EXIT(func(), ::testing::KilledBySignal(SIGSEGV), "");
+}
+
+TEST_F(MMapTest, NoExceedLimitData) {
+ void* prevbrk;
+ void* target_brk;
+ struct rlimit setlim;
+
+ prevbrk = sbrk(0);
+ ASSERT_NE(-1, reinterpret_cast<intptr_t>(prevbrk));
+ target_brk = reinterpret_cast<char*>(prevbrk) + 1;
+
+ setlim.rlim_cur = RLIM_INFINITY;
+ setlim.rlim_max = RLIM_INFINITY;
+ ASSERT_THAT(setrlimit(RLIMIT_DATA, &setlim), SyscallSucceeds());
+ EXPECT_THAT(brk(target_brk), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(MMapTest, ExceedLimitData) {
+ // To unit test this more precisely, we'd need access to the mm's start_brk
+ // and end_brk, which we don't have direct access to :/
+ void* prevbrk;
+ void* target_brk;
+ struct rlimit setlim;
+
+ prevbrk = sbrk(0);
+ ASSERT_NE(-1, reinterpret_cast<intptr_t>(prevbrk));
+ target_brk = reinterpret_cast<char*>(prevbrk) + 8192;
+
+ setlim.rlim_cur = 0;
+ setlim.rlim_max = RLIM_INFINITY;
+ // Set RLIMIT_DATA very low so any subsequent brk() calls fail.
+ // Reset RLIMIT_DATA during teardown step.
+ ASSERT_THAT(setrlimit(RLIMIT_DATA, &setlim), SyscallSucceeds());
+ EXPECT_THAT(brk(target_brk), SyscallFailsWithErrno(ENOMEM));
+ // Teardown step...
+ setlim.rlim_cur = RLIM_INFINITY;
+ ASSERT_THAT(setrlimit(RLIMIT_DATA, &setlim), SyscallSucceeds());
+}
+
+TEST_F(MMapTest, ExceedLimitDataPrlimit) {
+ // To unit test this more precisely, we'd need access to the mm's start_brk
+ // and end_brk, which we don't have direct access to :/
+ void* prevbrk;
+ void* target_brk;
+ struct rlimit setlim;
+
+ prevbrk = sbrk(0);
+ ASSERT_NE(-1, reinterpret_cast<intptr_t>(prevbrk));
+ target_brk = reinterpret_cast<char*>(prevbrk) + 8192;
+
+ setlim.rlim_cur = 0;
+ setlim.rlim_max = RLIM_INFINITY;
+ // Set RLIMIT_DATA very low so any subsequent brk() calls fail.
+ // Reset RLIMIT_DATA during teardown step.
+ ASSERT_THAT(prlimit(0, RLIMIT_DATA, &setlim, nullptr), SyscallSucceeds());
+ EXPECT_THAT(brk(target_brk), SyscallFailsWithErrno(ENOMEM));
+ // Teardown step...
+ setlim.rlim_cur = RLIM_INFINITY;
+ ASSERT_THAT(setrlimit(RLIMIT_DATA, &setlim), SyscallSucceeds());
+}
+
+TEST_F(MMapTest, ExceedLimitDataPrlimitPID) {
+ // To unit test this more precisely, we'd need access to the mm's start_brk
+ // and end_brk, which we don't have direct access to :/
+ void* prevbrk;
+ void* target_brk;
+ struct rlimit setlim;
+
+ prevbrk = sbrk(0);
+ ASSERT_NE(-1, reinterpret_cast<intptr_t>(prevbrk));
+ target_brk = reinterpret_cast<char*>(prevbrk) + 8192;
+
+ setlim.rlim_cur = 0;
+ setlim.rlim_max = RLIM_INFINITY;
+ // Set RLIMIT_DATA very low so any subsequent brk() calls fail.
+ // Reset RLIMIT_DATA during teardown step.
+ ASSERT_THAT(prlimit(syscall(__NR_gettid), RLIMIT_DATA, &setlim, nullptr),
+ SyscallSucceeds());
+ EXPECT_THAT(brk(target_brk), SyscallFailsWithErrno(ENOMEM));
+ // Teardown step...
+ setlim.rlim_cur = RLIM_INFINITY;
+ ASSERT_THAT(setrlimit(RLIMIT_DATA, &setlim), SyscallSucceeds());
+}
+
+TEST_F(MMapTest, NoExceedLimitAS) {
+ constexpr uint64_t kAllocBytes = 200 << 20;
+ // Add some headroom to the AS limit in case of e.g. unexpected stack
+ // expansion.
+ constexpr uint64_t kExtraASBytes = kAllocBytes + (20 << 20);
+ static_assert(kAllocBytes < kExtraASBytes,
+ "test depends on allocation not exceeding AS limit");
+
+ auto vss = ASSERT_NO_ERRNO_AND_VALUE(VirtualMemorySize());
+ struct rlimit setlim;
+ setlim.rlim_cur = vss + kExtraASBytes;
+ setlim.rlim_max = RLIM_INFINITY;
+ ASSERT_THAT(setrlimit(RLIMIT_AS, &setlim), SyscallSucceeds());
+ EXPECT_THAT(
+ Map(0, kAllocBytes, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceedsWithValue(Gt(0)));
+}
+
+TEST_F(MMapTest, ExceedLimitAS) {
+ constexpr uint64_t kAllocBytes = 200 << 20;
+ // Add some headroom to the AS limit in case of e.g. unexpected stack
+ // expansion.
+ constexpr uint64_t kExtraASBytes = 20 << 20;
+ static_assert(kAllocBytes > kExtraASBytes,
+ "test depends on allocation exceeding AS limit");
+
+ auto vss = ASSERT_NO_ERRNO_AND_VALUE(VirtualMemorySize());
+ struct rlimit setlim;
+ setlim.rlim_cur = vss + kExtraASBytes;
+ setlim.rlim_max = RLIM_INFINITY;
+ ASSERT_THAT(setrlimit(RLIMIT_AS, &setlim), SyscallSucceeds());
+ EXPECT_THAT(
+ Map(0, kAllocBytes, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallFailsWithErrno(ENOMEM));
+}
+
+// Tests that setting an anonymous mmap to PROT_NONE doesn't free the memory.
+TEST_F(MMapTest, SettingProtNoneDoesntFreeMemory) {
+ uintptr_t addr;
+ constexpr uint8_t kFirstWord[] = {42, 42, 42, 42};
+
+ EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceedsWithValue(Gt(0)));
+
+ memset(reinterpret_cast<void*>(addr), 42, kPageSize);
+
+ ASSERT_THAT(Protect(addr, kPageSize, PROT_NONE), SyscallSucceeds());
+ ASSERT_THAT(Protect(addr, kPageSize, PROT_READ | PROT_WRITE),
+ SyscallSucceeds());
+
+ // The written data should still be there.
+ EXPECT_EQ(
+ 0, memcmp(reinterpret_cast<void*>(addr), kFirstWord, sizeof(kFirstWord)));
+}
+
+constexpr char kFileContents[] = "Hello World!";
+
+class MMapFileTest : public MMapTest {
+ protected:
+ FileDescriptor fd_;
+ std::string filename_;
+
+ // Open a file for read/write
+ void SetUp() override {
+ MMapTest::SetUp();
+
+ filename_ = NewTempAbsPath();
+ fd_ = ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_CREAT | O_RDWR, 0644));
+
+ // Extend file so it can be written once mapped. Deliberately make the file
+ // only half a page in size, so we can test what happens when we access the
+ // second half.
+ // Use ftruncate(2) once the sentry supports it.
+ char zero = 0;
+ size_t count = 0;
+ do {
+ const DisableSave ds; // saving 2048 times is slow and useless.
+ Write(&zero, 1), SyscallSucceedsWithValue(1);
+ } while (++count < (kPageSize / 2));
+ ASSERT_THAT(lseek(fd_.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+ }
+
+ // Close and delete file
+ void TearDown() override {
+ MMapTest::TearDown();
+ fd_.reset(); // Make sure the files is closed before we unlink it.
+ ASSERT_THAT(unlink(filename_.c_str()), SyscallSucceeds());
+ }
+
+ ssize_t Read(char* buf, size_t count) {
+ ssize_t len = 0;
+ do {
+ ssize_t ret = read(fd_.get(), buf, count);
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ return len;
+ }
+
+ len += ret;
+ buf += ret;
+ } while (len < static_cast<ssize_t>(count));
+
+ return len;
+ }
+
+ ssize_t Write(const char* buf, size_t count) {
+ ssize_t len = 0;
+ do {
+ ssize_t ret = write(fd_.get(), buf, count);
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ return len;
+ }
+
+ len += ret;
+ buf += ret;
+ } while (len < static_cast<ssize_t>(count));
+
+ return len;
+ }
+};
+
+class MMapFileParamTest
+ : public MMapFileTest,
+ public ::testing::WithParamInterface<std::tuple<int, int>> {
+ protected:
+ int prot() const { return std::get<0>(GetParam()); }
+
+ int flags() const { return std::get<1>(GetParam()); }
+};
+
+// MAP_POPULATE allowed.
+// There isn't a good way to verify it actually did anything.
+TEST_P(MMapFileParamTest, MapPopulate) {
+ ASSERT_THAT(Map(0, kPageSize, prot(), flags() | MAP_POPULATE, fd_.get(), 0),
+ SyscallSucceeds());
+}
+
+// MAP_POPULATE on a short file.
+TEST_P(MMapFileParamTest, MapPopulateShort) {
+ ASSERT_THAT(
+ Map(0, 2 * kPageSize, prot(), flags() | MAP_POPULATE, fd_.get(), 0),
+ SyscallSucceeds());
+}
+
+// Read contents from mapped file.
+TEST_F(MMapFileTest, Read) {
+ size_t len = strlen(kFileContents);
+ ASSERT_EQ(len, Write(kFileContents, len));
+
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fd_.get(), 0),
+ SyscallSucceeds());
+
+ EXPECT_THAT(reinterpret_cast<char*>(addr),
+ EqualsMemory(std::string(kFileContents)));
+}
+
+// Map at an offset.
+TEST_F(MMapFileTest, MapOffset) {
+ ASSERT_THAT(lseek(fd_.get(), kPageSize, SEEK_SET), SyscallSucceeds());
+
+ size_t len = strlen(kFileContents);
+ ASSERT_EQ(len, Write(kFileContents, len));
+
+ uintptr_t addr;
+ ASSERT_THAT(
+ addr = Map(0, kPageSize, PROT_READ, MAP_PRIVATE, fd_.get(), kPageSize),
+ SyscallSucceeds());
+
+ EXPECT_THAT(reinterpret_cast<char*>(addr),
+ EqualsMemory(std::string(kFileContents)));
+}
+
+TEST_F(MMapFileTest, MapOffsetBeyondEnd) {
+ SetupGvisorDeathTest();
+
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
+ fd_.get(), 10 * kPageSize),
+ SyscallSucceeds());
+
+ // Touching the memory causes SIGBUS.
+ size_t len = strlen(kFileContents);
+ EXPECT_EXIT(std::copy(kFileContents, kFileContents + len,
+ reinterpret_cast<volatile char*>(addr)),
+ ::testing::KilledBySignal(SIGBUS), "");
+}
+
+// Verify mmap fails when sum of length and offset overflows.
+TEST_F(MMapFileTest, MapLengthPlusOffsetOverflows) {
+ const size_t length = static_cast<size_t>(-kPageSize);
+ const off_t offset = kPageSize;
+ ASSERT_THAT(Map(0, length, PROT_READ, MAP_PRIVATE, fd_.get(), offset),
+ SyscallFailsWithErrno(ENOMEM));
+}
+
+// MAP_PRIVATE PROT_WRITE is allowed on read-only FDs.
+TEST_F(MMapFileTest, WritePrivateOnReadOnlyFd) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_RDONLY));
+
+ uintptr_t addr;
+ EXPECT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
+ fd.get(), 0),
+ SyscallSucceeds());
+
+ // Touch the page to ensure the kernel didn't lie about writability.
+ size_t len = strlen(kFileContents);
+ std::copy(kFileContents, kFileContents + len,
+ reinterpret_cast<volatile char*>(addr));
+}
+
+// MAP_SHARED PROT_WRITE not allowed on read-only FDs.
+TEST_F(MMapFileTest, WriteSharedOnReadOnlyFd) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_RDONLY));
+
+ uintptr_t addr;
+ EXPECT_THAT(
+ addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd.get(), 0),
+ SyscallFailsWithErrno(EACCES));
+}
+
+// The FD must be readable.
+TEST_P(MMapFileParamTest, WriteOnlyFd) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(filename_, O_WRONLY));
+
+ uintptr_t addr;
+ EXPECT_THAT(addr = Map(0, kPageSize, prot(), flags(), fd.get(), 0),
+ SyscallFailsWithErrno(EACCES));
+}
+
+// Overwriting the contents of a file mapped MAP_SHARED PROT_READ
+// should cause the new data to be reflected in the mapping.
+TEST_F(MMapFileTest, ReadSharedConsistentWithOverwrite) {
+ // Start from scratch.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+
+ // Expand the file to two pages and dirty them.
+ std::string bufA(kPageSize, 'a');
+ ASSERT_THAT(Write(bufA.c_str(), bufA.size()),
+ SyscallSucceedsWithValue(bufA.size()));
+ std::string bufB(kPageSize, 'b');
+ ASSERT_THAT(Write(bufB.c_str(), bufB.size()),
+ SyscallSucceedsWithValue(bufB.size()));
+
+ // Map the page.
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
+ SyscallSucceeds());
+
+ // Check that the mapping contains the right file data.
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), bufA.c_str(), kPageSize));
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr + kPageSize), bufB.c_str(),
+ kPageSize));
+
+ // Start at the beginning of the file.
+ ASSERT_THAT(lseek(fd_.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+
+ // Swap the write pattern.
+ ASSERT_THAT(Write(bufB.c_str(), bufB.size()),
+ SyscallSucceedsWithValue(bufB.size()));
+ ASSERT_THAT(Write(bufA.c_str(), bufA.size()),
+ SyscallSucceedsWithValue(bufA.size()));
+
+ // Check that the mapping got updated.
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), bufB.c_str(), kPageSize));
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr + kPageSize), bufA.c_str(),
+ kPageSize));
+}
+
+// Partially overwriting a file mapped MAP_SHARED PROT_READ should be reflected
+// in the mapping.
+TEST_F(MMapFileTest, ReadSharedConsistentWithPartialOverwrite) {
+ // Start from scratch.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+
+ // Expand the file to two pages and dirty them.
+ std::string bufA(kPageSize, 'a');
+ ASSERT_THAT(Write(bufA.c_str(), bufA.size()),
+ SyscallSucceedsWithValue(bufA.size()));
+ std::string bufB(kPageSize, 'b');
+ ASSERT_THAT(Write(bufB.c_str(), bufB.size()),
+ SyscallSucceedsWithValue(bufB.size()));
+
+ // Map the page.
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
+ SyscallSucceeds());
+
+ // Check that the mapping contains the right file data.
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), bufA.c_str(), kPageSize));
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr + kPageSize), bufB.c_str(),
+ kPageSize));
+
+ // Start at the beginning of the file.
+ ASSERT_THAT(lseek(fd_.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+
+ // Do a partial overwrite, spanning both pages.
+ std::string bufC(kPageSize + (kPageSize / 2), 'c');
+ ASSERT_THAT(Write(bufC.c_str(), bufC.size()),
+ SyscallSucceedsWithValue(bufC.size()));
+
+ // Check that the mapping got updated.
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), bufC.c_str(),
+ kPageSize + (kPageSize / 2)));
+ EXPECT_EQ(0,
+ memcmp(reinterpret_cast<void*>(addr + kPageSize + (kPageSize / 2)),
+ bufB.c_str(), kPageSize / 2));
+}
+
+// Overwriting a file mapped MAP_SHARED PROT_READ should be reflected in the
+// mapping and the file.
+TEST_F(MMapFileTest, ReadSharedConsistentWithWriteAndFile) {
+ // Start from scratch.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+
+ // Expand the file to two full pages and dirty it.
+ std::string bufA(2 * kPageSize, 'a');
+ ASSERT_THAT(Write(bufA.c_str(), bufA.size()),
+ SyscallSucceedsWithValue(bufA.size()));
+
+ // Map only the first page.
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
+ SyscallSucceeds());
+
+ // Prepare to overwrite the file contents.
+ ASSERT_THAT(lseek(fd_.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+
+ // Overwrite everything, beyond the mapped portion.
+ std::string bufB(2 * kPageSize, 'b');
+ ASSERT_THAT(Write(bufB.c_str(), bufB.size()),
+ SyscallSucceedsWithValue(bufB.size()));
+
+ // What the mapped portion should now look like.
+ std::string bufMapped(kPageSize, 'b');
+
+ // Expect that the mapped portion is consistent.
+ EXPECT_EQ(
+ 0, memcmp(reinterpret_cast<void*>(addr), bufMapped.c_str(), kPageSize));
+
+ // Prepare to read the entire file contents.
+ ASSERT_THAT(lseek(fd_.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+
+ // Expect that the file was fully updated.
+ std::vector<char> bufFile(2 * kPageSize);
+ ASSERT_THAT(Read(bufFile.data(), bufFile.size()),
+ SyscallSucceedsWithValue(bufFile.size()));
+ // Cast to void* to avoid EXPECT_THAT assuming bufFile.data() is a
+ // NUL-terminated C std::string. EXPECT_THAT will try to print a char* as a C
+ // std::string, possibly overruning the buffer.
+ EXPECT_THAT(reinterpret_cast<void*>(bufFile.data()), EqualsMemory(bufB));
+}
+
+// Write data to mapped file.
+TEST_F(MMapFileTest, WriteShared) {
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_.get(), 0),
+ SyscallSucceeds());
+
+ size_t len = strlen(kFileContents);
+ memcpy(reinterpret_cast<void*>(addr), kFileContents, len);
+
+ // The file may not actually be updated until munmap is called.
+ ASSERT_THAT(Unmap(), SyscallSucceeds());
+
+ std::vector<char> buf(len);
+ ASSERT_THAT(Read(buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+ // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a
+ // NUL-terminated C string. EXPECT_THAT will try to print a char* as a C
+ // string, possibly overruning the buffer.
+ EXPECT_THAT(reinterpret_cast<void*>(buf.data()),
+ EqualsMemory(std::string(kFileContents)));
+}
+
+// Write data to portion of mapped page beyond the end of the file.
+// These writes are not reflected in the file.
+TEST_F(MMapFileTest, WriteSharedBeyondEnd) {
+ // The file is only half of a page. We map an entire page. Writes to the
+ // end of the mapping must not be reflected in the file.
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_.get(), 0),
+ SyscallSucceeds());
+
+ // First half; this is reflected in the file.
+ std::string first(kPageSize / 2, 'A');
+ memcpy(reinterpret_cast<void*>(addr), first.c_str(), first.size());
+
+ // Second half; this is not reflected in the file.
+ std::string second(kPageSize / 2, 'B');
+ memcpy(reinterpret_cast<void*>(addr + kPageSize / 2), second.c_str(),
+ second.size());
+
+ // The file may not actually be updated until munmap is called.
+ ASSERT_THAT(Unmap(), SyscallSucceeds());
+
+ // Big enough to fit the entire page, if the writes are mistakenly written to
+ // the file.
+ std::vector<char> buf(kPageSize);
+
+ // Only the first half is in the file.
+ ASSERT_THAT(Read(buf.data(), buf.size()),
+ SyscallSucceedsWithValue(first.size()));
+ // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a
+ // NUL-terminated C string. EXPECT_THAT will try to print a char* as a C
+ // NUL-terminated C std::string. EXPECT_THAT will try to print a char* as a C
+ // std::string, possibly overruning the buffer.
+ EXPECT_THAT(reinterpret_cast<void*>(buf.data()), EqualsMemory(first));
+}
+
+// The portion of a mapped page that becomes part of the file after a truncate
+// is reflected in the file.
+TEST_F(MMapFileTest, WriteSharedTruncateUp) {
+ // The file is only half of a page. We map an entire page. Writes to the
+ // end of the mapping must not be reflected in the file.
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_.get(), 0),
+ SyscallSucceeds());
+
+ // First half; this is reflected in the file.
+ std::string first(kPageSize / 2, 'A');
+ memcpy(reinterpret_cast<void*>(addr), first.c_str(), first.size());
+
+ // Second half; this is not reflected in the file now (see
+ // WriteSharedBeyondEnd), but will be after the truncate.
+ std::string second(kPageSize / 2, 'B');
+ memcpy(reinterpret_cast<void*>(addr + kPageSize / 2), second.c_str(),
+ second.size());
+
+ // Extend the file to a full page. The second half of the page will be
+ // reflected in the file.
+ EXPECT_THAT(ftruncate(fd_.get(), kPageSize), SyscallSucceeds());
+
+ // The file may not actually be updated until munmap is called.
+ ASSERT_THAT(Unmap(), SyscallSucceeds());
+
+ // The whole page is in the file.
+ std::vector<char> buf(kPageSize);
+ ASSERT_THAT(Read(buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+ // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a
+ // NUL-terminated C string. EXPECT_THAT will try to print a char* as a C
+ // string, possibly overruning the buffer.
+ EXPECT_THAT(reinterpret_cast<void*>(buf.data()), EqualsMemory(first));
+ EXPECT_THAT(reinterpret_cast<void*>(buf.data() + kPageSize / 2),
+ EqualsMemory(second));
+}
+
+TEST_F(MMapFileTest, ReadSharedTruncateDownThenUp) {
+ // Start from scratch.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+
+ // Expand the file to a full page and dirty it.
+ std::string buf(kPageSize, 'a');
+ ASSERT_THAT(Write(buf.c_str(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ // Map the page.
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
+ SyscallSucceeds());
+
+ // Check that the memory contains the file data.
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), buf.c_str(), kPageSize));
+
+ // Truncate down, then up.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+ EXPECT_THAT(ftruncate(fd_.get(), kPageSize), SyscallSucceeds());
+
+ // Check that the memory was zeroed.
+ std::string zeroed(kPageSize, '\0');
+ EXPECT_EQ(0,
+ memcmp(reinterpret_cast<void*>(addr), zeroed.c_str(), kPageSize));
+
+ // The file may not actually be updated until msync is called.
+ ASSERT_THAT(Msync(), SyscallSucceeds());
+
+ // Prepare to read the entire file contents.
+ ASSERT_THAT(lseek(fd_.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+
+ // Expect that the file is fully updated.
+ std::vector<char> bufFile(kPageSize);
+ ASSERT_THAT(Read(bufFile.data(), bufFile.size()),
+ SyscallSucceedsWithValue(bufFile.size()));
+ EXPECT_EQ(0, memcmp(bufFile.data(), zeroed.c_str(), kPageSize));
+}
+
+TEST_F(MMapFileTest, WriteSharedTruncateDownThenUp) {
+ // The file is only half of a page. We map an entire page. Writes to the
+ // end of the mapping must not be reflected in the file.
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_.get(), 0),
+ SyscallSucceeds());
+
+ // First half; this will be deleted by truncate(0).
+ std::string first(kPageSize / 2, 'A');
+ memcpy(reinterpret_cast<void*>(addr), first.c_str(), first.size());
+
+ // Truncate down, then up.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+ EXPECT_THAT(ftruncate(fd_.get(), kPageSize), SyscallSucceeds());
+
+ // The whole page is zeroed in memory.
+ std::string zeroed(kPageSize, '\0');
+ EXPECT_EQ(0,
+ memcmp(reinterpret_cast<void*>(addr), zeroed.c_str(), kPageSize));
+
+ // The file may not actually be updated until munmap is called.
+ ASSERT_THAT(Unmap(), SyscallSucceeds());
+
+ // The whole file is also zeroed.
+ std::vector<char> buf(kPageSize);
+ ASSERT_THAT(Read(buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+ // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a
+ // NUL-terminated C string. EXPECT_THAT will try to print a char* as a C
+ // string, possibly overruning the buffer.
+ EXPECT_THAT(reinterpret_cast<void*>(buf.data()), EqualsMemory(zeroed));
+}
+
+TEST_F(MMapFileTest, ReadSharedTruncateSIGBUS) {
+ SetupGvisorDeathTest();
+
+ // Start from scratch.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+
+ // Expand the file to a full page and dirty it.
+ std::string buf(kPageSize, 'a');
+ ASSERT_THAT(Write(buf.c_str(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ // Map the page.
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
+ SyscallSucceeds());
+
+ // Check that the mapping contains the file data.
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), buf.c_str(), kPageSize));
+
+ // Truncate down.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+
+ // Accessing the truncated region should cause a SIGBUS.
+ std::vector<char> in(kPageSize);
+ EXPECT_EXIT(
+ std::copy(reinterpret_cast<volatile char*>(addr),
+ reinterpret_cast<volatile char*>(addr) + kPageSize, in.data()),
+ ::testing::KilledBySignal(SIGBUS), "");
+}
+
+TEST_F(MMapFileTest, WriteSharedTruncateSIGBUS) {
+ SetupGvisorDeathTest();
+
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_.get(), 0),
+ SyscallSucceeds());
+
+ // Touch the memory to be sure it really is mapped.
+ size_t len = strlen(kFileContents);
+ memcpy(reinterpret_cast<void*>(addr), kFileContents, len);
+
+ // Truncate down.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+
+ // Accessing the truncated file should cause a SIGBUS.
+ EXPECT_EXIT(std::copy(kFileContents, kFileContents + len,
+ reinterpret_cast<volatile char*>(addr)),
+ ::testing::KilledBySignal(SIGBUS), "");
+}
+
+TEST_F(MMapFileTest, ReadSharedTruncatePartialPage) {
+ // Start from scratch.
+ EXPECT_THAT(ftruncate(fd_.get(), 0), SyscallSucceeds());
+
+ // Dirty the file.
+ std::string buf(kPageSize, 'a');
+ ASSERT_THAT(Write(buf.c_str(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ // Map a page.
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
+ SyscallSucceeds());
+
+ // Truncate to half of the page.
+ EXPECT_THAT(ftruncate(fd_.get(), kPageSize / 2), SyscallSucceeds());
+
+ // First half of the page untouched.
+ EXPECT_EQ(0,
+ memcmp(reinterpret_cast<void*>(addr), buf.data(), kPageSize / 2));
+
+ // Second half is zeroed.
+ std::string zeroed(kPageSize / 2, '\0');
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr + kPageSize / 2),
+ zeroed.c_str(), kPageSize / 2));
+}
+
+// Page can still be accessed and contents are intact after truncating a partial
+// page.
+TEST_F(MMapFileTest, WriteSharedTruncatePartialPage) {
+ // Expand the file to a full page.
+ EXPECT_THAT(ftruncate(fd_.get(), kPageSize), SyscallSucceeds());
+
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_.get(), 0),
+ SyscallSucceeds());
+
+ // Fill the entire page.
+ std::string contents(kPageSize, 'A');
+ memcpy(reinterpret_cast<void*>(addr), contents.c_str(), contents.size());
+
+ // Truncate half of the page.
+ EXPECT_THAT(ftruncate(fd_.get(), kPageSize / 2), SyscallSucceeds());
+
+ // First half of the page untouched.
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr), contents.c_str(),
+ kPageSize / 2));
+
+ // Second half zeroed.
+ std::string zeroed(kPageSize / 2, '\0');
+ EXPECT_EQ(0, memcmp(reinterpret_cast<void*>(addr + kPageSize / 2),
+ zeroed.c_str(), kPageSize / 2));
+}
+
+// MAP_PRIVATE writes are not carried through to the underlying file.
+TEST_F(MMapFileTest, WritePrivate) {
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
+ fd_.get(), 0),
+ SyscallSucceeds());
+
+ size_t len = strlen(kFileContents);
+ memcpy(reinterpret_cast<void*>(addr), kFileContents, len);
+
+ // The file should not be updated, but if it mistakenly is, it may not be
+ // until after munmap is called.
+ ASSERT_THAT(Unmap(), SyscallSucceeds());
+
+ std::vector<char> buf(len);
+ ASSERT_THAT(Read(buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+ // Cast to void* to avoid EXPECT_THAT assuming buf.data() is a
+ // NUL-terminated C string. EXPECT_THAT will try to print a char* as a C
+ // string, possibly overruning the buffer.
+ EXPECT_THAT(reinterpret_cast<void*>(buf.data()),
+ EqualsMemory(std::string(len, '\0')));
+}
+
+// SIGBUS raised when reading or writing past end of a mapped file.
+TEST_P(MMapFileParamTest, SigBusDeath) {
+ SetupGvisorDeathTest();
+
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0),
+ SyscallSucceeds());
+
+ auto* start = reinterpret_cast<volatile char*>(addr + kPageSize);
+
+ // MMapFileTest makes a file kPageSize/2 long. The entire first page should be
+ // accessible, but anything beyond it should not.
+ if (prot() & PROT_WRITE) {
+ // Write beyond first page.
+ size_t len = strlen(kFileContents);
+ EXPECT_EXIT(std::copy(kFileContents, kFileContents + len, start),
+ ::testing::KilledBySignal(SIGBUS), "");
+ } else {
+ // Read beyond first page.
+ std::vector<char> in(kPageSize);
+ EXPECT_EXIT(std::copy(start, start + kPageSize, in.data()),
+ ::testing::KilledBySignal(SIGBUS), "");
+ }
+}
+
+// Tests that SIGBUS is not raised when reading or writing to a file-mapped
+// page before EOF, even if part of the mapping extends beyond EOF.
+//
+// See b/27877699.
+TEST_P(MMapFileParamTest, NoSigBusOnPagesBeforeEOF) {
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0),
+ SyscallSucceeds());
+
+ // The test passes if this survives.
+ auto* start = reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1);
+ size_t len = strlen(kFileContents);
+ if (prot() & PROT_WRITE) {
+ std::copy(kFileContents, kFileContents + len, start);
+ } else {
+ std::vector<char> in(len);
+ std::copy(start, start + len, in.data());
+ }
+}
+
+// Tests that SIGBUS is not raised when reading or writing from a file-mapped
+// page containing EOF, *after* the EOF.
+TEST_P(MMapFileParamTest, NoSigBusOnPageContainingEOF) {
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, 2 * kPageSize, prot(), flags(), fd_.get(), 0),
+ SyscallSucceeds());
+
+ // The test passes if this survives. (Technically addr+kPageSize/2 is already
+ // beyond EOF, but +1 to check for fencepost errors.)
+ auto* start = reinterpret_cast<volatile char*>(addr + (kPageSize / 2) + 1);
+ size_t len = strlen(kFileContents);
+ if (prot() & PROT_WRITE) {
+ std::copy(kFileContents, kFileContents + len, start);
+ } else {
+ std::vector<char> in(len);
+ std::copy(start, start + len, in.data());
+ }
+}
+
+// Tests that reading from writable shared file-mapped pages succeeds.
+//
+// On most platforms this is trivial, but when the file is mapped via the sentry
+// page cache (which does not yet support writing to shared mappings), a bug
+// caused reads to fail unnecessarily on such mappings. See b/28913513.
+TEST_F(MMapFileTest, ReadingWritableSharedFilePageSucceeds) {
+ uintptr_t addr;
+ size_t len = strlen(kFileContents);
+
+ ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_.get(), 0),
+ SyscallSucceeds());
+
+ std::vector<char> buf(kPageSize);
+ // The test passes if this survives.
+ std::copy(reinterpret_cast<volatile char*>(addr),
+ reinterpret_cast<volatile char*>(addr) + len, buf.data());
+}
+
+// Tests that EFAULT is returned when invoking a syscall that requires the OS to
+// read past end of file (resulting in a fault in sentry context in the gVisor
+// case). See b/28913513.
+TEST_F(MMapFileTest, InternalSigBus) {
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE,
+ fd_.get(), 0),
+ SyscallSucceeds());
+
+ // This depends on the fact that gVisor implements pipes internally.
+ int pipefd[2];
+ ASSERT_THAT(pipe(pipefd), SyscallSucceeds());
+ EXPECT_THAT(
+ write(pipefd[1], reinterpret_cast<void*>(addr + kPageSize), kPageSize),
+ SyscallFailsWithErrno(EFAULT));
+
+ EXPECT_THAT(close(pipefd[0]), SyscallSucceeds());
+ EXPECT_THAT(close(pipefd[1]), SyscallSucceeds());
+}
+
+// Like InternalSigBus, but test the WriteZerosAt path by reading from
+// /dev/zero to a shared mapping (so that the SIGBUS isn't caught during
+// copy-on-write breaking).
+TEST_F(MMapFileTest, InternalSigBusZeroing) {
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_.get(), 0),
+ SyscallSucceeds());
+
+ const FileDescriptor dev_zero =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDONLY));
+ EXPECT_THAT(read(dev_zero.get(), reinterpret_cast<void*>(addr + kPageSize),
+ kPageSize),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+// Checks that mmaps with a length of uint64_t(-PAGE_SIZE + 1) or greater do not
+// induce a sentry panic (due to "rounding up" to 0).
+TEST_F(MMapTest, HugeLength) {
+ EXPECT_THAT(Map(0, static_cast<uint64_t>(-kPageSize + 1), PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallFailsWithErrno(ENOMEM));
+}
+
+// Tests for a specific gVisor MM caching bug.
+TEST_F(MMapTest, AccessCOWInvalidatesCachedSegments) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR));
+ auto zero_fd = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDONLY));
+
+ // Get a two-page private mapping and fill it with 1s.
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, 2 * kPageSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
+ SyscallSucceeds());
+ memset(addr_, 1, 2 * kPageSize);
+ MaybeSave();
+
+ // Fork to make the mapping copy-on-write.
+ pid_t const pid = fork();
+ if (pid == 0) {
+ // The child process waits for the parent to SIGKILL it.
+ while (true) {
+ pause();
+ }
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ auto cleanup_child = Cleanup([&] {
+ EXPECT_THAT(kill(pid, SIGKILL), SyscallSucceeds());
+ int status;
+ EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ });
+
+ // Induce a read-only Access of the first page of the mapping, which will not
+ // cause a copy. The usermem.Segment should be cached.
+ ASSERT_THAT(PwriteFd(fd.get(), addr_, kPageSize, 0),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Induce a writable Access of both pages of the mapping. This should
+ // invalidate the cached Segment.
+ ASSERT_THAT(PreadFd(zero_fd.get(), addr_, 2 * kPageSize, 0),
+ SyscallSucceedsWithValue(2 * kPageSize));
+
+ // Induce a read-only Access of the first page of the mapping again. It should
+ // read the 0s that were stored in the mapping by the read from /dev/zero. If
+ // the read failed to invalidate the cached Segment, it will instead read the
+ // 1s in the stale page.
+ ASSERT_THAT(PwriteFd(fd.get(), addr_, kPageSize, 0),
+ SyscallSucceedsWithValue(kPageSize));
+ std::vector<char> buf(kPageSize);
+ ASSERT_THAT(PreadFd(fd.get(), buf.data(), kPageSize, 0),
+ SyscallSucceedsWithValue(kPageSize));
+ for (size_t i = 0; i < kPageSize; i++) {
+ ASSERT_EQ(0, buf[i]) << "at offset " << i;
+ }
+}
+
+TEST_F(MMapTest, NoReserve) {
+ const size_t kSize = 10 * 1 << 20; // 10M
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0),
+ SyscallSucceeds());
+ EXPECT_GT(addr, 0);
+
+ // Check that every page can be read/written. Technically, writing to memory
+ // could SIGSEGV in case there is no more memory available. In gVisor it
+ // would never happen though because NORESERVE is ignored. In Linux, it's
+ // possible to fail, but allocation is small enough that it's highly likely
+ // to succeed.
+ for (size_t j = 0; j < kSize; j += kPageSize) {
+ EXPECT_EQ(0, reinterpret_cast<char*>(addr)[j]);
+ reinterpret_cast<char*>(addr)[j] = j;
+ }
+}
+
+// Map more than the gVisor page-cache map unit (64k) and ensure that
+// it is consistent with reading from the file.
+TEST_F(MMapFileTest, Bug38498194) {
+ // Choose a sufficiently large map unit.
+ constexpr int kSize = 4 * 1024 * 1024;
+ EXPECT_THAT(ftruncate(fd_.get(), kSize), SyscallSucceeds());
+
+ // Map a large enough region so that multiple internal segments
+ // are created to back the mapping.
+ uintptr_t addr;
+ ASSERT_THAT(
+ addr = Map(0, kSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd_.get(), 0),
+ SyscallSucceeds());
+
+ std::vector<char> expect(kSize, 'a');
+ std::copy(expect.data(), expect.data() + expect.size(),
+ reinterpret_cast<volatile char*>(addr));
+
+ // Trigger writeback for gVisor. In Linux pages stay cached until
+ // it can't hold onto them anymore.
+ ASSERT_THAT(Unmap(), SyscallSucceeds());
+
+ std::vector<char> buf(kSize);
+ ASSERT_THAT(Read(buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+ EXPECT_EQ(buf, expect) << std::string(buf.data(), buf.size());
+}
+
+// Tests that reading from a file to a memory mapping of the same file does not
+// deadlock. See b/34813270.
+TEST_F(MMapFileTest, SelfRead) {
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_.get(), 0),
+ SyscallSucceeds());
+ EXPECT_THAT(Read(reinterpret_cast<char*>(addr), kPageSize / 2),
+ SyscallSucceedsWithValue(kPageSize / 2));
+ // The resulting file contents are poorly-specified and irrelevant.
+}
+
+// Tests that writing to a file from a memory mapping of the same file does not
+// deadlock. Regression test for b/34813270.
+TEST_F(MMapFileTest, SelfWrite) {
+ uintptr_t addr;
+ ASSERT_THAT(addr = Map(0, kPageSize, PROT_READ, MAP_SHARED, fd_.get(), 0),
+ SyscallSucceeds());
+ EXPECT_THAT(Write(reinterpret_cast<char*>(addr), kPageSize / 2),
+ SyscallSucceedsWithValue(kPageSize / 2));
+ // The resulting file contents are poorly-specified and irrelevant.
+}
+
+TEST(MMapDeathTest, TruncateAfterCOWBreak) {
+ SetupGvisorDeathTest();
+
+ // Create and map a single-page file.
+ auto const temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file.path(), O_RDWR));
+ ASSERT_THAT(ftruncate(fd.get(), kPageSize), SyscallSucceeds());
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd.get(), 0));
+
+ // Write to this mapping, causing the page to be copied for write.
+ memset(mapping.ptr(), 'a', mapping.len());
+ MaybeSave(); // Trigger a co-operative save cycle.
+
+ // Truncate the file and expect it to invalidate the copied page.
+ ASSERT_THAT(ftruncate(fd.get(), 0), SyscallSucceeds());
+ EXPECT_EXIT(*reinterpret_cast<volatile char*>(mapping.ptr()),
+ ::testing::KilledBySignal(SIGBUS), "");
+}
+
+// Regression test for #147.
+TEST(MMapNoFixtureTest, MapReadOnlyAfterCreateWriteOnly) {
+ std::string filename = NewTempAbsPath();
+
+ // We have to create the file O_RDONLY to reproduce the bug because
+ // fsgofer.localFile.Create() silently upgrades O_WRONLY to O_RDWR, causing
+ // the cached "write-only" FD to be read/write and therefore usable by mmap().
+ auto const ro_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(filename, O_RDONLY | O_CREAT | O_EXCL, 0666));
+
+ // Get a write-only FD for the same file, which should be ignored by mmap()
+ // (but isn't in #147).
+ auto const wo_fd = ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_WRONLY));
+ ASSERT_THAT(ftruncate(wo_fd.get(), kPageSize), SyscallSucceeds());
+
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(nullptr, kPageSize, PROT_READ, MAP_SHARED, ro_fd.get(), 0));
+ std::vector<char> buf(kPageSize);
+ // The test passes if this survives.
+ std::copy(static_cast<char*>(mapping.ptr()),
+ static_cast<char*>(mapping.endptr()), buf.data());
+}
+
+// Conditional on MAP_32BIT.
+// This flag is supported only on x86-64, for 64-bit programs.
+#ifdef __x86_64__
+
+TEST(MMapNoFixtureTest, Map32Bit) {
+ auto const mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE | MAP_32BIT));
+ EXPECT_LT(mapping.addr(), static_cast<uintptr_t>(1) << 32);
+ EXPECT_LE(mapping.endaddr(), static_cast<uintptr_t>(1) << 32);
+}
+
+#endif // defined(__x86_64__)
+
+INSTANTIATE_TEST_SUITE_P(
+ ReadWriteSharedPrivate, MMapFileParamTest,
+ ::testing::Combine(::testing::ValuesIn({
+ PROT_READ,
+ PROT_WRITE,
+ PROT_READ | PROT_WRITE,
+ }),
+ ::testing::ValuesIn({MAP_SHARED, MAP_PRIVATE})));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc
new file mode 100644
index 000000000..a3e9745cf
--- /dev/null
+++ b/test/syscalls/linux/mount.cc
@@ -0,0 +1,327 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/time.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/mount_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(MountTest, MountBadFilesystem) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ // Linux expects a valid target before it checks the file system name.
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(mount("", dir.path().c_str(), "foobar", 0, ""),
+ SyscallFailsWithErrno(ENODEV));
+}
+
+TEST(MountTest, MountInvalidTarget) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ auto const dir = NewTempAbsPath();
+ EXPECT_THAT(mount("", dir.c_str(), "tmpfs", 0, ""),
+ SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(MountTest, MountPermDenied) {
+ // Clear CAP_SYS_ADMIN.
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))) {
+ EXPECT_NO_ERRNO(SetCapability(CAP_SYS_ADMIN, false));
+ }
+
+ // Linux expects a valid target before checking capability.
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(mount("", dir.path().c_str(), "", 0, ""),
+ SyscallFailsWithErrno(EPERM));
+}
+
+TEST(MountTest, UmountPermDenied) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto const mount =
+ ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), "tmpfs", 0, "", 0));
+
+ // Drop privileges in another thread, so we can still unmount the mounted
+ // directory.
+ ScopedThread([&]() {
+ EXPECT_NO_ERRNO(SetCapability(CAP_SYS_ADMIN, false));
+ EXPECT_THAT(umount(dir.path().c_str()), SyscallFailsWithErrno(EPERM));
+ });
+}
+
+TEST(MountTest, MountOverBusy) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto const fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(JoinPath(dir.path(), "foo"), O_CREAT | O_RDWR, 0777));
+
+ // Should be able to mount over a busy directory.
+ ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), "tmpfs", 0, "", 0));
+}
+
+TEST(MountTest, OpenFileBusy) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto const mount = ASSERT_NO_ERRNO_AND_VALUE(
+ Mount("", dir.path(), "tmpfs", 0, "mode=0700", 0));
+ auto const fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(JoinPath(dir.path(), "foo"), O_CREAT | O_RDWR, 0777));
+
+ // An open file should prevent unmounting.
+ EXPECT_THAT(umount(dir.path().c_str()), SyscallFailsWithErrno(EBUSY));
+}
+
+TEST(MountTest, UmountDetach) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ // structure:
+ //
+ // dir (mount point)
+ // subdir
+ // file
+ //
+ // We show that we can walk around in the mount after detach-unmount dir.
+ //
+ // We show that even though dir is unreachable from outside the mount, we can
+ // still reach dir's (former) parent!
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ const struct stat before = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
+ auto mount =
+ ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), "tmpfs", 0, "mode=0700",
+ /* umountflags= */ MNT_DETACH));
+ const struct stat after = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
+ EXPECT_NE(before.st_ino, after.st_ino);
+
+ // Create files in the new mount.
+ constexpr char kContents[] = "no no no";
+ auto const subdir =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path()));
+ auto const file = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateFileWith(dir.path(), kContents, 0777));
+
+ auto const dir_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(subdir.path(), O_RDONLY | O_DIRECTORY));
+ auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+ // Unmount the tmpfs.
+ mount.Release()();
+
+ const struct stat after2 = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
+ EXPECT_EQ(before.st_ino, after2.st_ino);
+
+ // Can still read file after unmounting.
+ std::vector<char> buf(sizeof(kContents));
+ EXPECT_THAT(ReadFd(fd.get(), buf.data(), buf.size()), SyscallSucceeds());
+
+ // Walk to dir.
+ auto const mounted_dir = ASSERT_NO_ERRNO_AND_VALUE(
+ OpenAt(dir_fd.get(), "..", O_DIRECTORY | O_RDONLY));
+ // Walk to dir/file.
+ auto const fd_again = ASSERT_NO_ERRNO_AND_VALUE(
+ OpenAt(mounted_dir.get(), std::string(Basename(file.path())), O_RDONLY));
+
+ std::vector<char> buf2(sizeof(kContents));
+ EXPECT_THAT(ReadFd(fd_again.get(), buf2.data(), buf2.size()),
+ SyscallSucceeds());
+ EXPECT_EQ(buf, buf2);
+
+ // Walking outside the unmounted realm should still work, too!
+ auto const dir_parent = ASSERT_NO_ERRNO_AND_VALUE(
+ OpenAt(mounted_dir.get(), "..", O_DIRECTORY | O_RDONLY));
+}
+
+TEST(MountTest, ActiveSubmountBusy) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto const mount1 = ASSERT_NO_ERRNO_AND_VALUE(
+ Mount("", dir.path(), "tmpfs", 0, "mode=0700", 0));
+
+ auto const dir2 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path()));
+ auto const mount2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir2.path(), "tmpfs", 0, "", 0));
+
+ // Since dir now has an active submount, should not be able to unmount.
+ EXPECT_THAT(umount(dir.path().c_str()), SyscallFailsWithErrno(EBUSY));
+}
+
+TEST(MountTest, MountTmpfs) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ // NOTE(b/129868551): Inode IDs are only stable across S/R if we have an open
+ // FD for that inode. Since we are going to compare inode IDs below, get a
+ // FileDescriptor for this directory here, which will be closed automatically
+ // at the end of the test.
+ auto const fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY, O_RDONLY));
+
+ const struct stat before = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
+
+ {
+ auto const mount = ASSERT_NO_ERRNO_AND_VALUE(
+ Mount("", dir.path(), "tmpfs", 0, "mode=0700", 0));
+
+ const struct stat s = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
+ EXPECT_EQ(s.st_mode, S_IFDIR | 0700);
+ EXPECT_NE(s.st_ino, before.st_ino);
+
+ EXPECT_NO_ERRNO(Open(JoinPath(dir.path(), "foo"), O_CREAT | O_RDWR, 0777));
+ }
+
+ // Now that dir is unmounted again, we should have the old inode back.
+ const struct stat after = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
+ EXPECT_EQ(before.st_ino, after.st_ino);
+}
+
+TEST(MountTest, MountTmpfsMagicValIgnored) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ auto const mount = ASSERT_NO_ERRNO_AND_VALUE(
+ Mount("", dir.path(), "tmpfs", MS_MGC_VAL, "mode=0700", 0));
+}
+
+// Passing nullptr to data is equivalent to "".
+TEST(MountTest, NullData) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ EXPECT_THAT(mount("", dir.path().c_str(), "tmpfs", 0, nullptr),
+ SyscallSucceeds());
+ EXPECT_THAT(umount2(dir.path().c_str(), 0), SyscallSucceeds());
+}
+
+TEST(MountTest, MountReadonly) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto const mount = ASSERT_NO_ERRNO_AND_VALUE(
+ Mount("", dir.path(), "tmpfs", MS_RDONLY, "mode=0777", 0));
+
+ const struct stat s = ASSERT_NO_ERRNO_AND_VALUE(Stat(dir.path()));
+ EXPECT_EQ(s.st_mode, S_IFDIR | 0777);
+
+ std::string const filename = JoinPath(dir.path(), "foo");
+ EXPECT_THAT(open(filename.c_str(), O_RDWR | O_CREAT, 0777),
+ SyscallFailsWithErrno(EROFS));
+}
+
+PosixErrorOr<absl::Time> ATime(absl::string_view file) {
+ struct stat s = {};
+ if (stat(std::string(file).c_str(), &s) == -1) {
+ return PosixError(errno, "stat failed");
+ }
+ return absl::TimeFromTimespec(s.st_atim);
+}
+
+TEST(MountTest, MountNoAtime) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto const mount = ASSERT_NO_ERRNO_AND_VALUE(
+ Mount("", dir.path(), "tmpfs", MS_NOATIME, "mode=0777", 0));
+
+ std::string const contents = "No no no, don't follow the instructions!";
+ auto const file = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateFileWith(dir.path(), contents, 0777));
+
+ absl::Time const before = ASSERT_NO_ERRNO_AND_VALUE(ATime(file.path()));
+
+ // Reading from the file should change the atime, but the MS_NOATIME flag
+ // should prevent that.
+ auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+ char buf[100];
+ int read_n;
+ ASSERT_THAT(read_n = read(fd.get(), buf, sizeof(buf)), SyscallSucceeds());
+ EXPECT_EQ(std::string(buf, read_n), contents);
+
+ absl::Time const after = ASSERT_NO_ERRNO_AND_VALUE(ATime(file.path()));
+
+ // Expect that atime hasn't changed.
+ EXPECT_EQ(before, after);
+}
+
+TEST(MountTest, MountNoExec) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto const mount = ASSERT_NO_ERRNO_AND_VALUE(
+ Mount("", dir.path(), "tmpfs", MS_NOEXEC, "mode=0777", 0));
+
+ std::string const contents = "No no no, don't follow the instructions!";
+ auto const file = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateFileWith(dir.path(), contents, 0777));
+
+ int execve_errno;
+ ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(file.path(), {}, {}, nullptr, &execve_errno));
+ EXPECT_EQ(execve_errno, EACCES);
+}
+
+TEST(MountTest, RenameRemoveMountPoint) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ auto const dir_parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto const dir =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir_parent.path()));
+ auto const new_dir = NewTempAbsPath();
+
+ auto const mount =
+ ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), "tmpfs", 0, "", 0));
+
+ ASSERT_THAT(rename(dir.path().c_str(), new_dir.c_str()),
+ SyscallFailsWithErrno(EBUSY));
+
+ ASSERT_THAT(rmdir(dir.path().c_str()), SyscallFailsWithErrno(EBUSY));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/mremap.cc b/test/syscalls/linux/mremap.cc
new file mode 100644
index 000000000..f0e5f7d82
--- /dev/null
+++ b/test/syscalls/linux/mremap.cc
@@ -0,0 +1,492 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#include <string>
+
+#include "gmock/gmock.h"
+#include "absl/strings/string_view.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/memory_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+using ::testing::_;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Fixture for mremap tests parameterized by mmap flags.
+using MremapParamTest = ::testing::TestWithParam<int>;
+
+TEST_P(MremapParamTest, Noop) {
+ Mapping const m =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam()));
+
+ ASSERT_THAT(Mremap(m.ptr(), kPageSize, kPageSize, 0, nullptr),
+ IsPosixErrorOkAndHolds(m.ptr()));
+ EXPECT_TRUE(IsMapped(m.addr()));
+}
+
+TEST_P(MremapParamTest, InPlace_ShrinkingWholeVMA) {
+ Mapping const m =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam()));
+
+ const auto rest = [&] {
+ // N.B. we must be in a single-threaded subprocess to ensure a
+ // background thread doesn't concurrently map the second page.
+ void* addr = mremap(m.ptr(), 2 * kPageSize, kPageSize, 0, nullptr);
+ TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed");
+ TEST_CHECK(addr == m.ptr());
+ MaybeSave();
+
+ TEST_CHECK(IsMapped(m.addr()));
+ TEST_CHECK(!IsMapped(m.addr() + kPageSize));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(MremapParamTest, InPlace_ShrinkingPartialVMA) {
+ Mapping const m =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(3 * kPageSize, PROT_NONE, GetParam()));
+
+ const auto rest = [&] {
+ void* addr = mremap(m.ptr(), 2 * kPageSize, kPageSize, 0, nullptr);
+ TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed");
+ TEST_CHECK(addr == m.ptr());
+ MaybeSave();
+
+ TEST_CHECK(IsMapped(m.addr()));
+ TEST_CHECK(!IsMapped(m.addr() + kPageSize));
+ TEST_CHECK(IsMapped(m.addr() + 2 * kPageSize));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(MremapParamTest, InPlace_ShrinkingAcrossVMAs) {
+ Mapping const m =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(3 * kPageSize, PROT_READ, GetParam()));
+ // Changing permissions on the first page forces it to become a separate vma.
+ ASSERT_THAT(mprotect(m.ptr(), kPageSize, PROT_NONE), SyscallSucceeds());
+
+ const auto rest = [&] {
+ // Both old_size and new_size now span two vmas; mremap
+ // shouldn't care.
+ void* addr = mremap(m.ptr(), 3 * kPageSize, 2 * kPageSize, 0, nullptr);
+ TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed");
+ TEST_CHECK(addr == m.ptr());
+ MaybeSave();
+
+ TEST_CHECK(IsMapped(m.addr()));
+ TEST_CHECK(IsMapped(m.addr() + kPageSize));
+ TEST_CHECK(!IsMapped(m.addr() + 2 * kPageSize));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(MremapParamTest, InPlace_ExpansionSuccess) {
+ Mapping const m =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam()));
+
+ const auto rest = [&] {
+ // Unmap the second page so that the first can be expanded back into it.
+ //
+ // N.B. we must be in a single-threaded subprocess to ensure a
+ // background thread doesn't concurrently map this page.
+ TEST_PCHECK(
+ munmap(reinterpret_cast<void*>(m.addr() + kPageSize), kPageSize) == 0);
+ MaybeSave();
+
+ void* addr = mremap(m.ptr(), kPageSize, 2 * kPageSize, 0, nullptr);
+ TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed");
+ TEST_CHECK(addr == m.ptr());
+ MaybeSave();
+
+ TEST_CHECK(IsMapped(m.addr()));
+ TEST_CHECK(IsMapped(m.addr() + kPageSize));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(MremapParamTest, InPlace_ExpansionFailure) {
+ Mapping const m =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(3 * kPageSize, PROT_NONE, GetParam()));
+
+ const auto rest = [&] {
+ // Unmap the second page, leaving a one-page hole. Trying to expand the
+ // first page to three pages should fail since the original third page
+ // is still mapped.
+ TEST_PCHECK(
+ munmap(reinterpret_cast<void*>(m.addr() + kPageSize), kPageSize) == 0);
+ MaybeSave();
+
+ void* addr = mremap(m.ptr(), kPageSize, 3 * kPageSize, 0, nullptr);
+ TEST_CHECK_MSG(addr == MAP_FAILED, "mremap unexpectedly succeeded");
+ TEST_PCHECK_MSG(errno == ENOMEM, "mremap failed with wrong errno");
+ MaybeSave();
+
+ TEST_CHECK(IsMapped(m.addr()));
+ TEST_CHECK(!IsMapped(m.addr() + kPageSize));
+ TEST_CHECK(IsMapped(m.addr() + 2 * kPageSize));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(MremapParamTest, MayMove_Expansion) {
+ Mapping const m =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(3 * kPageSize, PROT_NONE, GetParam()));
+
+ const auto rest = [&] {
+ // Unmap the second page, leaving a one-page hole. Trying to expand the
+ // first page to three pages with MREMAP_MAYMOVE should force the
+ // mapping to be relocated since the original third page is still
+ // mapped.
+ TEST_PCHECK(
+ munmap(reinterpret_cast<void*>(m.addr() + kPageSize), kPageSize) == 0);
+ MaybeSave();
+
+ void* addr2 =
+ mremap(m.ptr(), kPageSize, 3 * kPageSize, MREMAP_MAYMOVE, nullptr);
+ TEST_PCHECK_MSG(addr2 != MAP_FAILED, "mremap failed");
+ MaybeSave();
+
+ const Mapping m2 = Mapping(addr2, 3 * kPageSize);
+ TEST_CHECK(m.addr() != m2.addr());
+
+ TEST_CHECK(!IsMapped(m.addr()));
+ TEST_CHECK(!IsMapped(m.addr() + kPageSize));
+ TEST_CHECK(IsMapped(m.addr() + 2 * kPageSize));
+ TEST_CHECK(IsMapped(m2.addr()));
+ TEST_CHECK(IsMapped(m2.addr() + kPageSize));
+ TEST_CHECK(IsMapped(m2.addr() + 2 * kPageSize));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(MremapParamTest, Fixed_SourceAndDestinationCannotOverlap) {
+ Mapping const m =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam()));
+
+ ASSERT_THAT(Mremap(m.ptr(), kPageSize, kPageSize,
+ MREMAP_MAYMOVE | MREMAP_FIXED, m.ptr()),
+ PosixErrorIs(EINVAL, _));
+ EXPECT_TRUE(IsMapped(m.addr()));
+}
+
+TEST_P(MremapParamTest, Fixed_SameSize) {
+ Mapping const src =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam()));
+ Mapping const dst =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam()));
+
+ const auto rest = [&] {
+ // Unmap dst to create a hole.
+ TEST_PCHECK(munmap(dst.ptr(), kPageSize) == 0);
+ MaybeSave();
+
+ void* addr = mremap(src.ptr(), kPageSize, kPageSize,
+ MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr());
+ TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed");
+ TEST_CHECK(addr == dst.ptr());
+ MaybeSave();
+
+ TEST_CHECK(!IsMapped(src.addr()));
+ TEST_CHECK(IsMapped(dst.addr()));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(MremapParamTest, Fixed_SameSize_Unmapping) {
+ // Like the Fixed_SameSize case, but expect mremap to unmap the destination
+ // automatically.
+ Mapping const src =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam()));
+ Mapping const dst =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam()));
+
+ const auto rest = [&] {
+ void* addr = mremap(src.ptr(), kPageSize, kPageSize,
+ MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr());
+ TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed");
+ TEST_CHECK(addr == dst.ptr());
+ MaybeSave();
+
+ TEST_CHECK(!IsMapped(src.addr()));
+ TEST_CHECK(IsMapped(dst.addr()));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(MremapParamTest, Fixed_ShrinkingWholeVMA) {
+ Mapping const src =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam()));
+ Mapping const dst =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam()));
+
+ const auto rest = [&] {
+ // Unmap dst so we can check that mremap does not keep the
+ // second page.
+ TEST_PCHECK(munmap(dst.ptr(), 2 * kPageSize) == 0);
+ MaybeSave();
+
+ void* addr = mremap(src.ptr(), 2 * kPageSize, kPageSize,
+ MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr());
+ TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed");
+ TEST_CHECK(addr == dst.ptr());
+ MaybeSave();
+
+ TEST_CHECK(!IsMapped(src.addr()));
+ TEST_CHECK(!IsMapped(src.addr() + kPageSize));
+ TEST_CHECK(IsMapped(dst.addr()));
+ TEST_CHECK(!IsMapped(dst.addr() + kPageSize));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(MremapParamTest, Fixed_ShrinkingPartialVMA) {
+ Mapping const src =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(3 * kPageSize, PROT_NONE, GetParam()));
+ Mapping const dst =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam()));
+
+ const auto rest = [&] {
+ // Unmap dst so we can check that mremap does not keep the
+ // second page.
+ TEST_PCHECK(munmap(dst.ptr(), 2 * kPageSize) == 0);
+ MaybeSave();
+
+ void* addr = mremap(src.ptr(), 2 * kPageSize, kPageSize,
+ MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr());
+ TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed");
+ TEST_CHECK(addr == dst.ptr());
+ MaybeSave();
+
+ TEST_CHECK(!IsMapped(src.addr()));
+ TEST_CHECK(!IsMapped(src.addr() + kPageSize));
+ TEST_CHECK(IsMapped(src.addr() + 2 * kPageSize));
+ TEST_CHECK(IsMapped(dst.addr()));
+ TEST_CHECK(!IsMapped(dst.addr() + kPageSize));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(MremapParamTest, Fixed_ShrinkingAcrossVMAs) {
+ Mapping const src =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(3 * kPageSize, PROT_READ, GetParam()));
+ // Changing permissions on the first page forces it to become a separate vma.
+ ASSERT_THAT(mprotect(src.ptr(), kPageSize, PROT_NONE), SyscallSucceeds());
+ Mapping const dst =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam()));
+
+ const auto rest = [&] {
+ // Unlike flags=0, MREMAP_FIXED requires that [old_address,
+ // old_address+new_size) only spans a single vma.
+ void* addr = mremap(src.ptr(), 3 * kPageSize, 2 * kPageSize,
+ MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr());
+ TEST_CHECK_MSG(addr == MAP_FAILED, "mremap unexpectedly succeeded");
+ TEST_PCHECK_MSG(errno == EFAULT, "mremap failed with wrong errno");
+ MaybeSave();
+
+ TEST_CHECK(IsMapped(src.addr()));
+ TEST_CHECK(IsMapped(src.addr() + kPageSize));
+ // Despite failing, mremap should have unmapped [old_address+new_size,
+ // old_address+old_size) (i.e. the third page).
+ TEST_CHECK(!IsMapped(src.addr() + 2 * kPageSize));
+ // Despite failing, mremap should have unmapped the destination pages.
+ TEST_CHECK(!IsMapped(dst.addr()));
+ TEST_CHECK(!IsMapped(dst.addr() + kPageSize));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(MremapParamTest, Fixed_Expansion) {
+ Mapping const src =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, GetParam()));
+ Mapping const dst =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(2 * kPageSize, PROT_NONE, GetParam()));
+
+ const auto rest = [&] {
+ // Unmap dst so we can check that mremap actually maps all pages
+ // at the destination.
+ TEST_PCHECK(munmap(dst.ptr(), 2 * kPageSize) == 0);
+ MaybeSave();
+
+ void* addr = mremap(src.ptr(), kPageSize, 2 * kPageSize,
+ MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr());
+ TEST_PCHECK_MSG(addr != MAP_FAILED, "mremap failed");
+ TEST_CHECK(addr == dst.ptr());
+ MaybeSave();
+
+ TEST_CHECK(!IsMapped(src.addr()));
+ TEST_CHECK(IsMapped(dst.addr()));
+ TEST_CHECK(IsMapped(dst.addr() + kPageSize));
+ };
+
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+INSTANTIATE_TEST_SUITE_P(PrivateShared, MremapParamTest,
+ ::testing::Values(MAP_PRIVATE, MAP_SHARED));
+
+// mremap with old_size == 0 only works with MAP_SHARED after Linux 4.14
+// (dba58d3b8c50 "mm/mremap: fail map duplication attempts for private
+// mappings").
+
+TEST(MremapTest, InPlace_Copy) {
+ Mapping const m =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_SHARED));
+ EXPECT_THAT(Mremap(m.ptr(), 0, kPageSize, 0, nullptr),
+ PosixErrorIs(ENOMEM, _));
+}
+
+TEST(MremapTest, MayMove_Copy) {
+ Mapping const m =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_SHARED));
+
+ // Remainder of this test executes in a subprocess to ensure that if mremap
+ // incorrectly removes m, it is not remapped by another thread.
+ const auto rest = [&] {
+ void* ptr = mremap(m.ptr(), 0, kPageSize, MREMAP_MAYMOVE, nullptr);
+ MaybeSave();
+ TEST_PCHECK_MSG(ptr != MAP_FAILED, "mremap failed");
+ TEST_CHECK(ptr != m.ptr());
+ TEST_CHECK(IsMapped(m.addr()));
+ TEST_CHECK(IsMapped(reinterpret_cast<uintptr_t>(ptr)));
+ };
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+TEST(MremapTest, MustMove_Copy) {
+ Mapping const src =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_SHARED));
+ Mapping const dst =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_NONE, MAP_PRIVATE));
+
+ // Remainder of this test executes in a subprocess to ensure that if mremap
+ // incorrectly removes src, it is not remapped by another thread.
+ const auto rest = [&] {
+ void* ptr = mremap(src.ptr(), 0, kPageSize, MREMAP_MAYMOVE | MREMAP_FIXED,
+ dst.ptr());
+ MaybeSave();
+ TEST_PCHECK_MSG(ptr != MAP_FAILED, "mremap failed");
+ TEST_CHECK(ptr == dst.ptr());
+ TEST_CHECK(IsMapped(src.addr()));
+ TEST_CHECK(IsMapped(dst.addr()));
+ };
+ EXPECT_THAT(InForkedProcess(rest), IsPosixErrorOkAndHolds(0));
+}
+
+void ExpectAllBytesAre(absl::string_view v, char c) {
+ for (size_t i = 0; i < v.size(); i++) {
+ ASSERT_EQ(v[i], c) << "at offset " << i;
+ }
+}
+
+TEST(MremapTest, ExpansionPreservesCOWPagesAndExposesNewFilePages) {
+ // Create a file with 3 pages. The first is filled with 'a', the second is
+ // filled with 'b', and the third is filled with 'c'.
+ TempPath const file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+ ASSERT_THAT(WriteFd(fd.get(), std::string(kPageSize, 'a').c_str(), kPageSize),
+ SyscallSucceedsWithValue(kPageSize));
+ ASSERT_THAT(WriteFd(fd.get(), std::string(kPageSize, 'b').c_str(), kPageSize),
+ SyscallSucceedsWithValue(kPageSize));
+ ASSERT_THAT(WriteFd(fd.get(), std::string(kPageSize, 'c').c_str(), kPageSize),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Create a private mapping of the first 2 pages, and fill the second page
+ // with 'd'.
+ Mapping const src = ASSERT_NO_ERRNO_AND_VALUE(Mmap(nullptr, 2 * kPageSize,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE, fd.get(), 0));
+ memset(reinterpret_cast<void*>(src.addr() + kPageSize), 'd', kPageSize);
+ MaybeSave();
+
+ // Move the mapping while expanding it to 3 pages. The resulting mapping
+ // should contain the original first page of the file (filled with 'a'),
+ // followed by the private copy of the second page (filled with 'd'), followed
+ // by the newly-mapped third page of the file (filled with 'c').
+ Mapping const dst = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(3 * kPageSize, PROT_NONE, MAP_PRIVATE));
+ ASSERT_THAT(Mremap(src.ptr(), 2 * kPageSize, 3 * kPageSize,
+ MREMAP_MAYMOVE | MREMAP_FIXED, dst.ptr()),
+ IsPosixErrorOkAndHolds(dst.ptr()));
+ auto const v = dst.view();
+ ExpectAllBytesAre(v.substr(0, kPageSize), 'a');
+ ExpectAllBytesAre(v.substr(kPageSize, kPageSize), 'd');
+ ExpectAllBytesAre(v.substr(2 * kPageSize, kPageSize), 'c');
+}
+
+TEST(MremapDeathTest, SharedAnon) {
+ SetupGvisorDeathTest();
+
+ // Reserve 4 pages of address space.
+ Mapping const reserved = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(4 * kPageSize, PROT_NONE, MAP_PRIVATE));
+
+ // Create a 2-page shared anonymous mapping at the beginning of the
+ // reservation. Fill the first page with 'a' and the second with 'b'.
+ Mapping const m = ASSERT_NO_ERRNO_AND_VALUE(
+ Mmap(reserved.ptr(), 2 * kPageSize, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0));
+ memset(m.ptr(), 'a', kPageSize);
+ memset(reinterpret_cast<void*>(m.addr() + kPageSize), 'b', kPageSize);
+ MaybeSave();
+
+ // Shrink the mapping to 1 page in-place.
+ ASSERT_THAT(Mremap(m.ptr(), 2 * kPageSize, kPageSize, 0, m.ptr()),
+ IsPosixErrorOkAndHolds(m.ptr()));
+
+ // Expand the mapping to 3 pages, moving it forward by 1 page in the process
+ // since the old and new mappings can't overlap.
+ void* const new_m = reinterpret_cast<void*>(m.addr() + kPageSize);
+ ASSERT_THAT(Mremap(m.ptr(), kPageSize, 3 * kPageSize,
+ MREMAP_MAYMOVE | MREMAP_FIXED, new_m),
+ IsPosixErrorOkAndHolds(new_m));
+
+ // The first 2 pages of the mapping should still contain the data we wrote
+ // (i.e. shrinking should not have discarded the second page's data), while
+ // touching the third page should raise SIGBUS.
+ auto const v =
+ absl::string_view(static_cast<char const*>(new_m), 3 * kPageSize);
+ ExpectAllBytesAre(v.substr(0, kPageSize), 'a');
+ ExpectAllBytesAre(v.substr(kPageSize, kPageSize), 'b');
+ EXPECT_EXIT(ExpectAllBytesAre(v.substr(2 * kPageSize, kPageSize), '\0'),
+ ::testing::KilledBySignal(SIGBUS), "");
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/msync.cc b/test/syscalls/linux/msync.cc
new file mode 100644
index 000000000..2b2b6aef9
--- /dev/null
+++ b/test/syscalls/linux/msync.cc
@@ -0,0 +1,151 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "test/util/file_descriptor.h"
+#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Parameters for msync tests. Use a std::tuple so we can use
+// ::testing::Combine.
+using MsyncTestParam =
+ std::tuple<int, // msync flags
+ std::function<PosixErrorOr<Mapping>()> // returns mapping to
+ // msync
+ >;
+
+class MsyncParameterizedTest : public ::testing::TestWithParam<MsyncTestParam> {
+ protected:
+ int msync_flags() const { return std::get<0>(GetParam()); }
+
+ PosixErrorOr<Mapping> GetMapping() const { return std::get<1>(GetParam())(); }
+};
+
+// All valid msync(2) flag combinations, not including MS_INVALIDATE. ("Linux
+// permits a call to msync() that specifies neither [MS_SYNC or MS_ASYNC], with
+// semantics that are (currently) equivalent to specifying MS_ASYNC." -
+// msync(2))
+constexpr std::initializer_list<int> kMsyncFlags = {MS_SYNC, MS_ASYNC, 0};
+
+// Returns functions that return mappings that should be successfully
+// msync()able.
+std::vector<std::function<PosixErrorOr<Mapping>()>> SyncableMappings() {
+ std::vector<std::function<PosixErrorOr<Mapping>()>> funcs;
+ for (bool const writable : {false, true}) {
+ for (int const mflags : {MAP_PRIVATE, MAP_SHARED}) {
+ int const prot = PROT_READ | (writable ? PROT_WRITE : 0);
+ int const oflags = O_CREAT | (writable ? O_RDWR : O_RDONLY);
+ funcs.push_back([=] { return MmapAnon(kPageSize, prot, mflags); });
+ funcs.push_back([=]() -> PosixErrorOr<Mapping> {
+ std::string const path = NewTempAbsPath();
+ ASSIGN_OR_RETURN_ERRNO(auto fd, Open(path, oflags, 0644));
+ // Don't unlink the file since that breaks save/restore. Just let the
+ // test infrastructure clean up all of our temporary files when we're
+ // done.
+ return Mmap(nullptr, kPageSize, prot, mflags, fd.get(), 0);
+ });
+ }
+ }
+ return funcs;
+}
+
+PosixErrorOr<Mapping> NoMappings() {
+ return PosixError(EINVAL, "unexpected attempt to create a mapping");
+}
+
+// "Fixture" for msync tests that hold for all valid flags, but do not create
+// mappings.
+using MsyncNoMappingTest = MsyncParameterizedTest;
+
+TEST_P(MsyncNoMappingTest, UnmappedAddressWithZeroLengthSucceeds) {
+ EXPECT_THAT(msync(nullptr, 0, msync_flags()), SyscallSucceeds());
+}
+
+TEST_P(MsyncNoMappingTest, UnmappedAddressWithNonzeroLengthFails) {
+ EXPECT_THAT(msync(nullptr, kPageSize, msync_flags()),
+ SyscallFailsWithErrno(ENOMEM));
+}
+
+INSTANTIATE_TEST_SUITE_P(All, MsyncNoMappingTest,
+ ::testing::Combine(::testing::ValuesIn(kMsyncFlags),
+ ::testing::Values(NoMappings)));
+
+// "Fixture" for msync tests that are not parameterized by msync flags, but do
+// create mappings.
+using MsyncNoFlagsTest = MsyncParameterizedTest;
+
+TEST_P(MsyncNoFlagsTest, BothSyncAndAsyncFails) {
+ auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping());
+ EXPECT_THAT(msync(m.ptr(), m.len(), MS_SYNC | MS_ASYNC),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ All, MsyncNoFlagsTest,
+ ::testing::Combine(::testing::Values(0), // ignored
+ ::testing::ValuesIn(SyncableMappings())));
+
+// "Fixture" for msync tests parameterized by both msync flags and sources of
+// mappings.
+using MsyncFullParamTest = MsyncParameterizedTest;
+
+TEST_P(MsyncFullParamTest, NormallySucceeds) {
+ auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping());
+ EXPECT_THAT(msync(m.ptr(), m.len(), msync_flags()), SyscallSucceeds());
+}
+
+TEST_P(MsyncFullParamTest, UnalignedLengthSucceeds) {
+ auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping());
+ EXPECT_THAT(msync(m.ptr(), m.len() - 1, msync_flags()), SyscallSucceeds());
+}
+
+TEST_P(MsyncFullParamTest, UnalignedAddressFails) {
+ auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping());
+ EXPECT_THAT(
+ msync(reinterpret_cast<void*>(m.addr() + 1), m.len() - 1, msync_flags()),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(MsyncFullParamTest, InvalidateUnlockedSucceeds) {
+ auto m = ASSERT_NO_ERRNO_AND_VALUE(GetMapping());
+ EXPECT_THAT(msync(m.ptr(), m.len(), msync_flags() | MS_INVALIDATE),
+ SyscallSucceeds());
+}
+
+// The test for MS_INVALIDATE on mlocked pages is in mlock.cc since it requires
+// probing for mlock support.
+
+INSTANTIATE_TEST_SUITE_P(
+ All, MsyncFullParamTest,
+ ::testing::Combine(::testing::ValuesIn(kMsyncFlags),
+ ::testing::ValuesIn(SyncableMappings())));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/munmap.cc b/test/syscalls/linux/munmap.cc
new file mode 100644
index 000000000..067241f4d
--- /dev/null
+++ b/test/syscalls/linux/munmap.cc
@@ -0,0 +1,53 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/mman.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class MunmapTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ m_ = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(MAP_FAILED, m_);
+ }
+
+ void* m_ = nullptr;
+};
+
+TEST_F(MunmapTest, HappyCase) {
+ EXPECT_THAT(munmap(m_, kPageSize), SyscallSucceeds());
+}
+
+TEST_F(MunmapTest, ZeroLength) {
+ EXPECT_THAT(munmap(m_, 0), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(MunmapTest, LastPageRoundUp) {
+ // Attempt to unmap up to and including the last page.
+ EXPECT_THAT(munmap(m_, static_cast<size_t>(-kPageSize + 1)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/network_namespace.cc b/test/syscalls/linux/network_namespace.cc
new file mode 100644
index 000000000..133fdecf0
--- /dev/null
+++ b/test/syscalls/linux/network_namespace.cc
@@ -0,0 +1,52 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <net/if.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+TEST(NetworkNamespaceTest, LoopbackExists) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+ ScopedThread t([&] {
+ ASSERT_THAT(unshare(CLONE_NEWNET), SyscallSucceedsWithValue(0));
+
+ // TODO(gvisor.dev/issue/1833): Update this to test that only "lo" exists.
+ // Check loopback device exists.
+ int sock = socket(AF_INET, SOCK_DGRAM, 0);
+ ASSERT_THAT(sock, SyscallSucceeds());
+ struct ifreq ifr;
+ strncpy(ifr.ifr_name, "lo", IFNAMSIZ);
+ EXPECT_THAT(ioctl(sock, SIOCGIFINDEX, &ifr), SyscallSucceeds())
+ << "lo cannot be found";
+ });
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc
new file mode 100644
index 000000000..bb7d108e8
--- /dev/null
+++ b/test/syscalls/linux/open.cc
@@ -0,0 +1,451 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/capability.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/memory/memory.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// This test is currently very rudimentary.
+//
+// There are plenty of extra cases to cover once the sentry supports them.
+//
+// Different types of opens:
+// * O_CREAT
+// * O_DIRECTORY
+// * O_NOFOLLOW
+// * O_PATH <- Will we ever support this?
+//
+// Special operations on open:
+// * O_EXCL
+//
+// Special files:
+// * Blocking behavior for a named pipe.
+//
+// Different errors:
+// * EACCES
+// * EEXIST
+// * ENAMETOOLONG
+// * ELOOP
+// * ENOTDIR
+// * EPERM
+class OpenTest : public FileTest {
+ void SetUp() override {
+ FileTest::SetUp();
+
+ ASSERT_THAT(
+ write(test_file_fd_.get(), test_data_.c_str(), test_data_.length()),
+ SyscallSucceedsWithValue(test_data_.length()));
+ EXPECT_THAT(lseek(test_file_fd_.get(), 0, SEEK_SET), SyscallSucceeds());
+ }
+
+ public:
+ const std::string test_data_ = "hello world\n";
+};
+
+TEST_F(OpenTest, OTrunc) {
+ auto dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
+ ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
+ ASSERT_THAT(open(dirpath.c_str(), O_TRUNC, 0666),
+ SyscallFailsWithErrno(EISDIR));
+}
+
+TEST_F(OpenTest, OTruncAndReadOnlyDir) {
+ auto dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
+ ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
+ ASSERT_THAT(open(dirpath.c_str(), O_TRUNC | O_RDONLY, 0666),
+ SyscallFailsWithErrno(EISDIR));
+}
+
+TEST_F(OpenTest, OTruncAndReadOnlyFile) {
+ auto dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncfile");
+ const FileDescriptor existing =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(dirpath.c_str(), O_RDWR | O_CREAT, 0666));
+ const FileDescriptor otrunc = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(dirpath.c_str(), O_TRUNC | O_RDONLY, 0666));
+}
+
+TEST_F(OpenTest, ReadOnly) {
+ char buf;
+ const FileDescriptor ro_file =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY));
+
+ EXPECT_THAT(read(ro_file.get(), &buf, 1), SyscallSucceedsWithValue(1));
+ EXPECT_THAT(lseek(ro_file.get(), 0, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(write(ro_file.get(), &buf, 1), SyscallFailsWithErrno(EBADF));
+}
+
+TEST_F(OpenTest, WriteOnly) {
+ char buf;
+ const FileDescriptor wo_file =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_WRONLY));
+
+ EXPECT_THAT(read(wo_file.get(), &buf, 1), SyscallFailsWithErrno(EBADF));
+ EXPECT_THAT(lseek(wo_file.get(), 0, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(write(wo_file.get(), &buf, 1), SyscallSucceedsWithValue(1));
+}
+
+TEST_F(OpenTest, ReadWrite) {
+ char buf;
+ const FileDescriptor rw_file =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ EXPECT_THAT(read(rw_file.get(), &buf, 1), SyscallSucceedsWithValue(1));
+ EXPECT_THAT(lseek(rw_file.get(), 0, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(write(rw_file.get(), &buf, 1), SyscallSucceedsWithValue(1));
+}
+
+TEST_F(OpenTest, RelPath) {
+ auto name = std::string(Basename(test_file_name_));
+
+ ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds());
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name, O_RDONLY));
+}
+
+TEST_F(OpenTest, AbsPath) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY));
+}
+
+TEST_F(OpenTest, AtRelPath) {
+ auto name = std::string(Basename(test_file_name_));
+ const FileDescriptor dirfd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(GetAbsoluteTestTmpdir(), O_RDONLY | O_DIRECTORY));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(OpenAt(dirfd.get(), name, O_RDONLY));
+}
+
+TEST_F(OpenTest, AtAbsPath) {
+ const FileDescriptor dirfd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(GetAbsoluteTestTmpdir(), O_RDONLY | O_DIRECTORY));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(OpenAt(dirfd.get(), test_file_name_, O_RDONLY));
+}
+
+TEST_F(OpenTest, OpenNoFollowSymlink) {
+ const std::string link_path = JoinPath(GetAbsoluteTestTmpdir(), "link");
+ ASSERT_THAT(symlink(test_file_name_.c_str(), link_path.c_str()),
+ SyscallSucceeds());
+ auto cleanup = Cleanup([link_path]() {
+ EXPECT_THAT(unlink(link_path.c_str()), SyscallSucceeds());
+ });
+
+ // Open will succeed without O_NOFOLLOW and fails with O_NOFOLLOW.
+ const FileDescriptor fd2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(link_path, O_RDONLY));
+ ASSERT_THAT(open(link_path.c_str(), O_RDONLY | O_NOFOLLOW),
+ SyscallFailsWithErrno(ELOOP));
+}
+
+TEST_F(OpenTest, OpenNoFollowStillFollowsLinksInPath) {
+ // We will create the following structure:
+ // tmp_folder/real_folder/file
+ // tmp_folder/sym_folder -> tmp_folder/real_folder
+ //
+ // We will then open tmp_folder/sym_folder/file with O_NOFOLLOW and it
+ // should succeed as O_NOFOLLOW only applies to the final path component.
+ auto tmp_path =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(GetAbsoluteTestTmpdir()));
+ auto sym_path = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), tmp_path.path()));
+ auto file_path =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(tmp_path.path()));
+
+ auto path_via_symlink = JoinPath(sym_path.path(), Basename(file_path.path()));
+ const FileDescriptor fd2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(path_via_symlink, O_RDONLY | O_NOFOLLOW));
+}
+
+// Test that open(2) can follow symlinks that point back to the same tree.
+// Test sets up files as follows:
+// root/child/symlink => redirects to ../..
+// root/child/target => regular file
+//
+// open("root/child/symlink/root/child/file")
+TEST_F(OpenTest, SymlinkRecurse) {
+ auto root =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(GetAbsoluteTestTmpdir()));
+ auto child = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path()));
+ auto symlink = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(child.path(), "../.."));
+ auto target = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateFileWith(child.path(), "abc", 0644));
+ auto path_via_symlink =
+ JoinPath(symlink.path(), Basename(root.path()), Basename(child.path()),
+ Basename(target.path()));
+ const auto contents =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents(path_via_symlink));
+ ASSERT_EQ(contents, "abc");
+}
+
+TEST_F(OpenTest, Fault) {
+ char* totally_not_null = nullptr;
+ ASSERT_THAT(open(totally_not_null, O_RDONLY), SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(OpenTest, AppendOnly) {
+ // First write some data to the fresh file.
+ const int64_t kBufSize = 1024;
+ std::vector<char> buf(kBufSize, 'a');
+
+ FileDescriptor fd0 = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR));
+
+ std::fill(buf.begin(), buf.end(), 'a');
+ EXPECT_THAT(WriteFd(fd0.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+ fd0.reset(); // Close the file early.
+
+ // Next get two handles to the same file. We open two files because we want
+ // to make sure that appending is respected between them.
+ const FileDescriptor fd1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR | O_APPEND));
+ EXPECT_THAT(lseek(fd1.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+
+ const FileDescriptor fd2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR | O_APPEND));
+ EXPECT_THAT(lseek(fd2.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+
+ // Then try to write to the first file and make sure the bytes are appended.
+ EXPECT_THAT(WriteFd(fd1.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ // Check that the size of the file is correct and that the offset has been
+ // incremented to that size.
+ struct stat s0;
+ EXPECT_THAT(fstat(fd1.get(), &s0), SyscallSucceeds());
+ EXPECT_EQ(s0.st_size, kBufSize * 2);
+ EXPECT_THAT(lseek(fd1.get(), 0, SEEK_CUR),
+ SyscallSucceedsWithValue(kBufSize * 2));
+
+ // Then try to write to the second file and make sure the bytes are appended.
+ EXPECT_THAT(WriteFd(fd2.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ // Check that the size of the file is correct and that the offset has been
+ // incremented to that size.
+ struct stat s1;
+ EXPECT_THAT(fstat(fd2.get(), &s1), SyscallSucceeds());
+ EXPECT_EQ(s1.st_size, kBufSize * 3);
+ EXPECT_THAT(lseek(fd2.get(), 0, SEEK_CUR),
+ SyscallSucceedsWithValue(kBufSize * 3));
+}
+
+TEST_F(OpenTest, AppendConcurrentWrite) {
+ constexpr int kThreadCount = 5;
+ constexpr int kBytesPerThread = 10000;
+ std::unique_ptr<ScopedThread> threads[kThreadCount];
+
+ // In case of the uncached policy, we expect that a file system can be changed
+ // externally, so we create a new inode each time when we open a file and we
+ // can't guarantee that writes to files with O_APPEND will work correctly.
+ SKIP_IF(getenv("GVISOR_GOFER_UNCACHED"));
+
+ EXPECT_THAT(truncate(test_file_name_.c_str(), 0), SyscallSucceeds());
+
+ std::string filename = test_file_name_;
+ DisableSave ds; // Too many syscalls.
+ // Start kThreadCount threads which will write concurrently into the same
+ // file.
+ for (int i = 0; i < kThreadCount; i++) {
+ threads[i] = absl::make_unique<ScopedThread>([filename]() {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(filename, O_RDWR | O_APPEND));
+
+ for (int j = 0; j < kBytesPerThread; j++) {
+ EXPECT_THAT(WriteFd(fd.get(), &j, 1), SyscallSucceedsWithValue(1));
+ }
+ });
+ }
+ for (int i = 0; i < kThreadCount; i++) {
+ threads[i]->Join();
+ }
+
+ // Check that the size of the file is correct.
+ struct stat st;
+ EXPECT_THAT(stat(test_file_name_.c_str(), &st), SyscallSucceeds());
+ EXPECT_EQ(st.st_size, kThreadCount * kBytesPerThread);
+}
+
+TEST_F(OpenTest, Truncate) {
+ {
+ // First write some data to the new file and close it.
+ FileDescriptor fd0 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_WRONLY));
+ std::vector<char> orig(10, 'a');
+ EXPECT_THAT(WriteFd(fd0.get(), orig.data(), orig.size()),
+ SyscallSucceedsWithValue(orig.size()));
+ }
+
+ // Then open with truncate and verify that offset is set to 0.
+ const FileDescriptor fd1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR | O_TRUNC));
+ EXPECT_THAT(lseek(fd1.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+
+ // Then write less data to the file and ensure the old content is gone.
+ std::vector<char> want(5, 'b');
+ EXPECT_THAT(WriteFd(fd1.get(), want.data(), want.size()),
+ SyscallSucceedsWithValue(want.size()));
+
+ struct stat stat;
+ EXPECT_THAT(fstat(fd1.get(), &stat), SyscallSucceeds());
+ EXPECT_EQ(stat.st_size, want.size());
+ EXPECT_THAT(lseek(fd1.get(), 0, SEEK_CUR),
+ SyscallSucceedsWithValue(want.size()));
+
+ // Read the data and ensure only the latest write is in the file.
+ std::vector<char> got(want.size() + 1, 'c');
+ ASSERT_THAT(pread(fd1.get(), got.data(), got.size(), 0),
+ SyscallSucceedsWithValue(want.size()));
+ EXPECT_EQ(memcmp(want.data(), got.data(), want.size()), 0)
+ << "rbuf=" << got.data();
+ EXPECT_EQ(got.back(), 'c'); // Last byte should not have been modified.
+}
+
+TEST_F(OpenTest, NameTooLong) {
+ char buf[4097] = {};
+ memset(buf, 'a', 4097);
+ EXPECT_THAT(open(buf, O_RDONLY), SyscallFailsWithErrno(ENAMETOOLONG));
+}
+
+TEST_F(OpenTest, DotsFromRoot) {
+ const FileDescriptor rootfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/", O_RDONLY | O_DIRECTORY));
+ const FileDescriptor other_rootfd =
+ ASSERT_NO_ERRNO_AND_VALUE(OpenAt(rootfd.get(), "..", O_RDONLY));
+}
+
+TEST_F(OpenTest, DirectoryWritableFails) {
+ ASSERT_THAT(open(GetAbsoluteTestTmpdir().c_str(), O_RDWR),
+ SyscallFailsWithErrno(EISDIR));
+}
+
+TEST_F(OpenTest, FileNotDirectory) {
+ // Create a file and try to open it with O_DIRECTORY.
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ ASSERT_THAT(open(file.path().c_str(), O_RDONLY | O_DIRECTORY),
+ SyscallFailsWithErrno(ENOTDIR));
+}
+
+TEST_F(OpenTest, Null) {
+ char c = '\0';
+ ASSERT_THAT(open(&c, O_RDONLY), SyscallFailsWithErrno(ENOENT));
+}
+
+// NOTE(b/119785738): While the man pages specify that this behavior should be
+// undefined, Linux truncates the file on opening read only if we have write
+// permission, so we will too.
+TEST_F(OpenTest, CanTruncateReadOnly) {
+ const FileDescriptor fd1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY | O_TRUNC));
+
+ struct stat stat;
+ EXPECT_THAT(fstat(fd1.get(), &stat), SyscallSucceeds());
+ EXPECT_EQ(stat.st_size, 0);
+}
+
+// If we don't have read permission on the file, opening with
+// O_TRUNC should fail.
+TEST_F(OpenTest, CanTruncateReadOnlyNoWritePermission_NoRandomSave) {
+ // Drop capabilities that allow us to override file permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+
+ const DisableSave ds; // Permissions are dropped.
+ ASSERT_THAT(chmod(test_file_name_.c_str(), S_IRUSR | S_IRGRP),
+ SyscallSucceeds());
+
+ ASSERT_THAT(open(test_file_name_.c_str(), O_RDONLY | O_TRUNC),
+ SyscallFailsWithErrno(EACCES));
+
+ const FileDescriptor fd1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY));
+
+ struct stat stat;
+ EXPECT_THAT(fstat(fd1.get(), &stat), SyscallSucceeds());
+ EXPECT_EQ(stat.st_size, test_data_.size());
+}
+
+// If we don't have read permission but have write permission, opening O_WRONLY
+// and O_TRUNC should succeed.
+TEST_F(OpenTest, CanTruncateWriteOnlyNoReadPermission_NoRandomSave) {
+ const DisableSave ds; // Permissions are dropped.
+
+ EXPECT_THAT(fchmod(test_file_fd_.get(), S_IWUSR | S_IWGRP),
+ SyscallSucceeds());
+
+ const FileDescriptor fd1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_WRONLY | O_TRUNC));
+
+ EXPECT_THAT(fchmod(test_file_fd_.get(), S_IRUSR | S_IRGRP),
+ SyscallSucceeds());
+
+ const FileDescriptor fd2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY));
+
+ struct stat stat;
+ EXPECT_THAT(fstat(fd2.get(), &stat), SyscallSucceeds());
+ EXPECT_EQ(stat.st_size, 0);
+}
+
+TEST_F(OpenTest, CanTruncateWithStrangePermissions) {
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+ const DisableSave ds; // Permissions are dropped.
+ std::string path = NewTempAbsPath();
+ int fd;
+ // Create a file without user permissions.
+ EXPECT_THAT( // SAVE_BELOW
+ fd = open(path.c_str(), O_CREAT | O_TRUNC | O_WRONLY, 055),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+
+ // Cannot open file because we are owner and have no permissions set.
+ EXPECT_THAT(open(path.c_str(), O_RDONLY), SyscallFailsWithErrno(EACCES));
+
+ // We *can* chmod the file, because we are the owner.
+ EXPECT_THAT(chmod(path.c_str(), 0755), SyscallSucceeds());
+
+ // Now we can open the file again.
+ EXPECT_THAT(fd = open(path.c_str(), O_RDWR), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST_F(OpenTest, OpenNonDirectoryWithTrailingSlash) {
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const std::string bad_path = file.path() + "/";
+ EXPECT_THAT(open(bad_path.c_str(), O_RDONLY), SyscallFailsWithErrno(ENOTDIR));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/open_create.cc b/test/syscalls/linux/open_create.cc
new file mode 100644
index 000000000..51eacf3f2
--- /dev/null
+++ b/test/syscalls/linux/open_create.cc
@@ -0,0 +1,155 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/temp_umask.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+TEST(CreateTest, TmpFile) {
+ int fd;
+ EXPECT_THAT(fd = open(JoinPath(GetAbsoluteTestTmpdir(), "a").c_str(),
+ O_RDWR | O_CREAT, 0666),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST(CreateTest, ExistingFile) {
+ int fd;
+ EXPECT_THAT(
+ fd = open(JoinPath(GetAbsoluteTestTmpdir(), "ExistingFile").c_str(),
+ O_RDWR | O_CREAT, 0666),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+
+ EXPECT_THAT(
+ fd = open(JoinPath(GetAbsoluteTestTmpdir(), "ExistingFile").c_str(),
+ O_RDWR | O_CREAT, 0666),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST(CreateTest, CreateAtFile) {
+ int dirfd;
+ EXPECT_THAT(dirfd = open(GetAbsoluteTestTmpdir().c_str(), O_DIRECTORY, 0666),
+ SyscallSucceeds());
+ EXPECT_THAT(openat(dirfd, "CreateAtFile", O_RDWR | O_CREAT, 0666),
+ SyscallSucceeds());
+ EXPECT_THAT(close(dirfd), SyscallSucceeds());
+}
+
+TEST(CreateTest, HonorsUmask_NoRandomSave) {
+ const DisableSave ds; // file cannot be re-opened as writable.
+ TempUmask mask(0222);
+ int fd;
+ ASSERT_THAT(
+ fd = open(JoinPath(GetAbsoluteTestTmpdir(), "UmaskedFile").c_str(),
+ O_RDWR | O_CREAT, 0666),
+ SyscallSucceeds());
+ struct stat statbuf;
+ ASSERT_THAT(fstat(fd, &statbuf), SyscallSucceeds());
+ EXPECT_EQ(0444, statbuf.st_mode & 0777);
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST(CreateTest, CreateExclusively) {
+ std::string filename = NewTempAbsPath();
+
+ int fd;
+ ASSERT_THAT(fd = open(filename.c_str(), O_CREAT | O_RDWR, 0644),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+
+ EXPECT_THAT(open(filename.c_str(), O_CREAT | O_EXCL | O_RDWR, 0644),
+ SyscallFailsWithErrno(EEXIST));
+}
+
+TEST(CreateTeast, CreatWithOTrunc) {
+ std::string dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
+ ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
+ ASSERT_THAT(open(dirpath.c_str(), O_CREAT | O_TRUNC, 0666),
+ SyscallFailsWithErrno(EISDIR));
+}
+
+TEST(CreateTeast, CreatDirWithOTruncAndReadOnly) {
+ std::string dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncd");
+ ASSERT_THAT(mkdir(dirpath.c_str(), 0777), SyscallSucceeds());
+ ASSERT_THAT(open(dirpath.c_str(), O_CREAT | O_TRUNC | O_RDONLY, 0666),
+ SyscallFailsWithErrno(EISDIR));
+}
+
+TEST(CreateTeast, CreatFileWithOTruncAndReadOnly) {
+ std::string dirpath = JoinPath(GetAbsoluteTestTmpdir(), "truncfile");
+ int dirfd;
+ ASSERT_THAT(dirfd = open(dirpath.c_str(), O_RDWR | O_CREAT, 0666),
+ SyscallSucceeds());
+ ASSERT_THAT(open(dirpath.c_str(), O_CREAT | O_TRUNC | O_RDONLY, 0666),
+ SyscallSucceeds());
+ ASSERT_THAT(close(dirfd), SyscallSucceeds());
+}
+
+TEST(CreateTest, CreateFailsOnUnpermittedDir) {
+ // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to
+ // always override directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_THAT(open("/foo", O_CREAT | O_RDWR, 0644),
+ SyscallFailsWithErrno(EACCES));
+}
+
+TEST(CreateTest, CreateFailsOnDirWithoutWritePerms) {
+ // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to
+ // always override directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ auto parent = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0555));
+ auto file = JoinPath(parent.path(), "foo");
+ ASSERT_THAT(open(file.c_str(), O_CREAT | O_RDWR, 0644),
+ SyscallFailsWithErrno(EACCES));
+}
+
+// A file originally created RW, but opened RO can later be opened RW.
+// Regression test for b/65385065.
+TEST(CreateTest, OpenCreateROThenRW) {
+ TempPath file(NewTempAbsPath());
+
+ // Create a RW file, but only open it RO.
+ FileDescriptor fd1 = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(file.path(), O_CREAT | O_EXCL | O_RDONLY, 0644));
+
+ // Now get a RW FD.
+ FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+ // fd1 is not writable, but fd2 is.
+ char c = 'a';
+ EXPECT_THAT(WriteFd(fd1.get(), &c, 1), SyscallFailsWithErrno(EBADF));
+ EXPECT_THAT(WriteFd(fd2.get(), &c, 1), SyscallSucceedsWithValue(1));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/packet_socket.cc b/test/syscalls/linux/packet_socket.cc
new file mode 100644
index 000000000..5ac68feb4
--- /dev/null
+++ b/test/syscalls/linux/packet_socket.cc
@@ -0,0 +1,440 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <linux/capability.h>
+#include <linux/if_arp.h>
+#include <linux/if_packet.h>
+#include <net/ethernet.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/base/internal/endian.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+// Some of these tests involve sending packets via AF_PACKET sockets and the
+// loopback interface. Because AF_PACKET circumvents so much of the networking
+// stack, Linux sees these packets as "martian", i.e. they claim to be to/from
+// localhost but don't have the usual associated data. Thus Linux drops them by
+// default. You can see where this happens by following the code at:
+//
+// - net/ipv4/ip_input.c:ip_rcv_finish, which calls
+// - net/ipv4/route.c:ip_route_input_noref, which calls
+// - net/ipv4/route.c:ip_route_input_slow, which finds and drops martian
+// packets.
+//
+// To tell Linux not to drop these packets, you need to tell it to accept our
+// funny packets (which are completely valid and correct, but lack associated
+// in-kernel data because we use AF_PACKET):
+//
+// echo 1 >> /proc/sys/net/ipv4/conf/lo/accept_local
+// echo 1 >> /proc/sys/net/ipv4/conf/lo/route_localnet
+//
+// These tests require CAP_NET_RAW to run.
+
+// TODO(gvisor.dev/issue/173): gVisor support.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using ::testing::AnyOf;
+using ::testing::Eq;
+
+constexpr char kMessage[] = "soweoneul malhaebwa";
+constexpr in_port_t kPort = 0x409c; // htons(40000)
+
+//
+// "Cooked" tests. Cooked AF_PACKET sockets do not contain link layer
+// headers, and provide link layer destination/source information via a
+// returned struct sockaddr_ll.
+//
+
+// Send kMessage via sock to loopback
+void SendUDPMessage(int sock) {
+ struct sockaddr_in dest = {};
+ dest.sin_port = kPort;
+ dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ dest.sin_family = AF_INET;
+ EXPECT_THAT(sendto(sock, kMessage, sizeof(kMessage), 0,
+ reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)),
+ SyscallSucceedsWithValue(sizeof(kMessage)));
+}
+
+// Send an IP packet and make sure ETH_P_<something else> doesn't pick it up.
+TEST(BasicCookedPacketTest, WrongType) {
+ if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+ ASSERT_THAT(socket(AF_PACKET, SOCK_DGRAM, ETH_P_PUP),
+ SyscallFailsWithErrno(EPERM));
+ GTEST_SKIP();
+ }
+
+ FileDescriptor sock = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_PUP)));
+
+ // Let's use a simple IP payload: a UDP datagram.
+ FileDescriptor udp_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+ SendUDPMessage(udp_sock.get());
+
+ // Wait and make sure the socket never becomes readable.
+ struct pollfd pfd = {};
+ pfd.fd = sock.get();
+ pfd.events = POLLIN;
+ EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
+}
+
+// Tests for "cooked" (SOCK_DGRAM) packet(7) sockets.
+class CookedPacketTest : public ::testing::TestWithParam<int> {
+ protected:
+ // Creates a socket to be used in tests.
+ void SetUp() override;
+
+ // Closes the socket created by SetUp().
+ void TearDown() override;
+
+ // Gets the device index of the loopback device.
+ int GetLoopbackIndex();
+
+ // The socket used for both reading and writing.
+ int socket_;
+};
+
+void CookedPacketTest::SetUp() {
+ if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+ ASSERT_THAT(socket(AF_PACKET, SOCK_DGRAM, htons(GetParam())),
+ SyscallFailsWithErrno(EPERM));
+ GTEST_SKIP();
+ }
+
+ if (!IsRunningOnGvisor()) {
+ FileDescriptor acceptLocal = ASSERT_NO_ERRNO_AND_VALUE(
+ Open("/proc/sys/net/ipv4/conf/lo/accept_local", O_RDONLY));
+ FileDescriptor routeLocalnet = ASSERT_NO_ERRNO_AND_VALUE(
+ Open("/proc/sys/net/ipv4/conf/lo/route_localnet", O_RDONLY));
+ char enabled;
+ ASSERT_THAT(read(acceptLocal.get(), &enabled, 1),
+ SyscallSucceedsWithValue(1));
+ ASSERT_EQ(enabled, '1');
+ ASSERT_THAT(read(routeLocalnet.get(), &enabled, 1),
+ SyscallSucceedsWithValue(1));
+ ASSERT_EQ(enabled, '1');
+ }
+
+ ASSERT_THAT(socket_ = socket(AF_PACKET, SOCK_DGRAM, htons(GetParam())),
+ SyscallSucceeds());
+}
+
+void CookedPacketTest::TearDown() {
+ // TearDown will be run even if we skip the test.
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+ EXPECT_THAT(close(socket_), SyscallSucceeds());
+ }
+}
+
+int CookedPacketTest::GetLoopbackIndex() {
+ struct ifreq ifr;
+ snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+ EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds());
+ EXPECT_NE(ifr.ifr_ifindex, 0);
+ return ifr.ifr_ifindex;
+}
+
+// Receive and verify the message via packet socket on interface.
+void ReceiveMessage(int sock, int ifindex) {
+ // Wait for the socket to become readable.
+ struct pollfd pfd = {};
+ pfd.fd = sock;
+ pfd.events = POLLIN;
+ EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 2000), SyscallSucceedsWithValue(1));
+
+ // Read and verify the data.
+ constexpr size_t packet_size =
+ sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kMessage);
+ char buf[64];
+ struct sockaddr_ll src = {};
+ socklen_t src_len = sizeof(src);
+ ASSERT_THAT(recvfrom(sock, buf, sizeof(buf), 0,
+ reinterpret_cast<struct sockaddr*>(&src), &src_len),
+ SyscallSucceedsWithValue(packet_size));
+
+ // sockaddr_ll ends with an 8 byte physical address field, but ethernet
+ // addresses only use 6 bytes. Linux used to return sizeof(sockaddr_ll)-2
+ // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns
+ // sizeof(sockaddr_ll).
+ ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2)));
+
+ // TODO(b/129292371): Verify protocol once we return it.
+ // Verify the source address.
+ EXPECT_EQ(src.sll_family, AF_PACKET);
+ EXPECT_EQ(src.sll_ifindex, ifindex);
+ EXPECT_EQ(src.sll_halen, ETH_ALEN);
+ // This came from the loopback device, so the address is all 0s.
+ for (int i = 0; i < src.sll_halen; i++) {
+ EXPECT_EQ(src.sll_addr[i], 0);
+ }
+
+ // Verify the IP header. We memcpy to deal with pointer aligment.
+ struct iphdr ip = {};
+ memcpy(&ip, buf, sizeof(ip));
+ EXPECT_EQ(ip.ihl, 5);
+ EXPECT_EQ(ip.version, 4);
+ EXPECT_EQ(ip.tot_len, htons(packet_size));
+ EXPECT_EQ(ip.protocol, IPPROTO_UDP);
+ EXPECT_EQ(ip.daddr, htonl(INADDR_LOOPBACK));
+ EXPECT_EQ(ip.saddr, htonl(INADDR_LOOPBACK));
+
+ // Verify the UDP header. We memcpy to deal with pointer aligment.
+ struct udphdr udp = {};
+ memcpy(&udp, buf + sizeof(iphdr), sizeof(udp));
+ EXPECT_EQ(udp.dest, kPort);
+ EXPECT_EQ(udp.len, htons(sizeof(udphdr) + sizeof(kMessage)));
+
+ // Verify the payload.
+ char* payload = reinterpret_cast<char*>(buf + sizeof(iphdr) + sizeof(udphdr));
+ EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0);
+}
+
+// Receive via a packet socket.
+TEST_P(CookedPacketTest, Receive) {
+ // Let's use a simple IP payload: a UDP datagram.
+ FileDescriptor udp_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+ SendUDPMessage(udp_sock.get());
+
+ // Receive and verify the data.
+ int loopback_index = GetLoopbackIndex();
+ ReceiveMessage(socket_, loopback_index);
+}
+
+// Send via a packet socket.
+TEST_P(CookedPacketTest, Send) {
+ // TODO(b/129292371): Remove once we support packet socket writing.
+ SKIP_IF(IsRunningOnGvisor());
+
+ // Let's send a UDP packet and receive it using a regular UDP socket.
+ FileDescriptor udp_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+ struct sockaddr_in bind_addr = {};
+ bind_addr.sin_family = AF_INET;
+ bind_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ bind_addr.sin_port = kPort;
+ ASSERT_THAT(
+ bind(udp_sock.get(), reinterpret_cast<struct sockaddr*>(&bind_addr),
+ sizeof(bind_addr)),
+ SyscallSucceeds());
+
+ // Set up the destination physical address.
+ struct sockaddr_ll dest = {};
+ dest.sll_family = AF_PACKET;
+ dest.sll_halen = ETH_ALEN;
+ dest.sll_ifindex = GetLoopbackIndex();
+ dest.sll_protocol = htons(ETH_P_IP);
+ // We're sending to the loopback device, so the address is all 0s.
+ memset(dest.sll_addr, 0x00, ETH_ALEN);
+
+ // Set up the IP header.
+ struct iphdr iphdr = {0};
+ iphdr.ihl = 5;
+ iphdr.version = 4;
+ iphdr.tos = 0;
+ iphdr.tot_len =
+ htons(sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kMessage));
+ // Get a pseudo-random ID. If we clash with an in-use ID the test will fail,
+ // but we have no way of getting an ID we know to be good.
+ srand(*reinterpret_cast<unsigned int*>(&iphdr));
+ iphdr.id = rand();
+ // Linux sets this bit ("do not fragment") for small packets.
+ iphdr.frag_off = 1 << 6;
+ iphdr.ttl = 64;
+ iphdr.protocol = IPPROTO_UDP;
+ iphdr.daddr = htonl(INADDR_LOOPBACK);
+ iphdr.saddr = htonl(INADDR_LOOPBACK);
+ iphdr.check = IPChecksum(iphdr);
+
+ // Set up the UDP header.
+ struct udphdr udphdr = {};
+ udphdr.source = kPort;
+ udphdr.dest = kPort;
+ udphdr.len = htons(sizeof(udphdr) + sizeof(kMessage));
+ udphdr.check = UDPChecksum(iphdr, udphdr, kMessage, sizeof(kMessage));
+
+ // Copy both headers and the payload into our packet buffer.
+ char send_buf[sizeof(iphdr) + sizeof(udphdr) + sizeof(kMessage)];
+ memcpy(send_buf, &iphdr, sizeof(iphdr));
+ memcpy(send_buf + sizeof(iphdr), &udphdr, sizeof(udphdr));
+ memcpy(send_buf + sizeof(iphdr) + sizeof(udphdr), kMessage, sizeof(kMessage));
+
+ // Send it.
+ ASSERT_THAT(sendto(socket_, send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Wait for the packet to become available on both sockets.
+ struct pollfd pfd = {};
+ pfd.fd = udp_sock.get();
+ pfd.events = POLLIN;
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 5000), SyscallSucceedsWithValue(1));
+ pfd.fd = socket_;
+ pfd.events = POLLIN;
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 5000), SyscallSucceedsWithValue(1));
+
+ // Receive on the packet socket.
+ char recv_buf[sizeof(send_buf)];
+ ASSERT_THAT(recv(socket_, recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ ASSERT_EQ(memcmp(recv_buf, send_buf, sizeof(send_buf)), 0);
+
+ // Receive on the UDP socket.
+ struct sockaddr_in src;
+ socklen_t src_len = sizeof(src);
+ ASSERT_THAT(recvfrom(udp_sock.get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT,
+ reinterpret_cast<struct sockaddr*>(&src), &src_len),
+ SyscallSucceedsWithValue(sizeof(kMessage)));
+ // Check src and payload.
+ EXPECT_EQ(strncmp(recv_buf, kMessage, sizeof(kMessage)), 0);
+ EXPECT_EQ(src.sin_family, AF_INET);
+ EXPECT_EQ(src.sin_port, kPort);
+ EXPECT_EQ(src.sin_addr.s_addr, htonl(INADDR_LOOPBACK));
+}
+
+// Bind and receive via packet socket.
+TEST_P(CookedPacketTest, BindReceive) {
+ struct sockaddr_ll bind_addr = {};
+ bind_addr.sll_family = AF_PACKET;
+ bind_addr.sll_protocol = htons(GetParam());
+ bind_addr.sll_ifindex = GetLoopbackIndex();
+
+ ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+ sizeof(bind_addr)),
+ SyscallSucceeds());
+
+ // Let's use a simple IP payload: a UDP datagram.
+ FileDescriptor udp_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+ SendUDPMessage(udp_sock.get());
+
+ // Receive and verify the data.
+ ReceiveMessage(socket_, bind_addr.sll_ifindex);
+}
+
+// Double Bind socket.
+TEST_P(CookedPacketTest, DoubleBind) {
+ struct sockaddr_ll bind_addr = {};
+ bind_addr.sll_family = AF_PACKET;
+ bind_addr.sll_protocol = htons(GetParam());
+ bind_addr.sll_ifindex = GetLoopbackIndex();
+
+ ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+ sizeof(bind_addr)),
+ SyscallSucceeds());
+
+ // Binding socket again should fail.
+ ASSERT_THAT(
+ bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+ sizeof(bind_addr)),
+ // Linux 4.09 returns EINVAL here, but some time before 4.19 it switched
+ // to EADDRINUSE.
+ AnyOf(SyscallFailsWithErrno(EADDRINUSE), SyscallFailsWithErrno(EINVAL)));
+}
+
+// Bind and verify we do not receive data on interface which is not bound
+TEST_P(CookedPacketTest, BindDrop) {
+ // Let's use a simple IP payload: a UDP datagram.
+ FileDescriptor udp_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+ struct ifaddrs* if_addr_list = nullptr;
+ auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); });
+
+ ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds());
+
+ // Get interface other than loopback.
+ struct ifreq ifr = {};
+ for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) {
+ if (strcmp(i->ifa_name, "lo") != 0) {
+ strncpy(ifr.ifr_name, i->ifa_name, sizeof(ifr.ifr_name));
+ break;
+ }
+ }
+
+ // Skip if no interface is available other than loopback.
+ if (strlen(ifr.ifr_name) == 0) {
+ GTEST_SKIP();
+ }
+
+ // Get interface index.
+ EXPECT_THAT(ioctl(socket_, SIOCGIFINDEX, &ifr), SyscallSucceeds());
+ EXPECT_NE(ifr.ifr_ifindex, 0);
+
+ // Bind to packet socket requires only family, protocol and ifindex.
+ struct sockaddr_ll bind_addr = {};
+ bind_addr.sll_family = AF_PACKET;
+ bind_addr.sll_protocol = htons(GetParam());
+ bind_addr.sll_ifindex = ifr.ifr_ifindex;
+
+ ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+ sizeof(bind_addr)),
+ SyscallSucceeds());
+
+ // Send to loopback interface.
+ struct sockaddr_in dest = {};
+ dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ dest.sin_family = AF_INET;
+ dest.sin_port = kPort;
+ EXPECT_THAT(sendto(udp_sock.get(), kMessage, sizeof(kMessage), 0,
+ reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)),
+ SyscallSucceedsWithValue(sizeof(kMessage)));
+
+ // Wait and make sure the socket never receives any data.
+ struct pollfd pfd = {};
+ pfd.fd = socket_;
+ pfd.events = POLLIN;
+ EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
+}
+
+// Bind with invalid address.
+TEST_P(CookedPacketTest, BindFail) {
+ // Null address.
+ ASSERT_THAT(
+ bind(socket_, nullptr, sizeof(struct sockaddr)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallFailsWithErrno(EINVAL)));
+
+ // Address of size 1.
+ uint8_t addr = 0;
+ ASSERT_THAT(
+ bind(socket_, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+INSTANTIATE_TEST_SUITE_P(AllInetTests, CookedPacketTest,
+ ::testing::Values(ETH_P_IP, ETH_P_ALL));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/packet_socket_raw.cc b/test/syscalls/linux/packet_socket_raw.cc
new file mode 100644
index 000000000..4093ac813
--- /dev/null
+++ b/test/syscalls/linux/packet_socket_raw.cc
@@ -0,0 +1,565 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/capability.h>
+#include <linux/if_arp.h>
+#include <linux/if_packet.h>
+#include <net/ethernet.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/base/internal/endian.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+// Some of these tests involve sending packets via AF_PACKET sockets and the
+// loopback interface. Because AF_PACKET circumvents so much of the networking
+// stack, Linux sees these packets as "martian", i.e. they claim to be to/from
+// localhost but don't have the usual associated data. Thus Linux drops them by
+// default. You can see where this happens by following the code at:
+//
+// - net/ipv4/ip_input.c:ip_rcv_finish, which calls
+// - net/ipv4/route.c:ip_route_input_noref, which calls
+// - net/ipv4/route.c:ip_route_input_slow, which finds and drops martian
+// packets.
+//
+// To tell Linux not to drop these packets, you need to tell it to accept our
+// funny packets (which are completely valid and correct, but lack associated
+// in-kernel data because we use AF_PACKET):
+//
+// echo 1 >> /proc/sys/net/ipv4/conf/lo/accept_local
+// echo 1 >> /proc/sys/net/ipv4/conf/lo/route_localnet
+//
+// These tests require CAP_NET_RAW to run.
+
+// TODO(gvisor.dev/issue/173): gVisor support.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using ::testing::AnyOf;
+using ::testing::Eq;
+
+constexpr char kMessage[] = "soweoneul malhaebwa";
+constexpr in_port_t kPort = 0x409c; // htons(40000)
+
+// Send kMessage via sock to loopback
+void SendUDPMessage(int sock) {
+ struct sockaddr_in dest = {};
+ dest.sin_port = kPort;
+ dest.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ dest.sin_family = AF_INET;
+ EXPECT_THAT(sendto(sock, kMessage, sizeof(kMessage), 0,
+ reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)),
+ SyscallSucceedsWithValue(sizeof(kMessage)));
+}
+
+//
+// Raw tests. Packets sent with raw AF_PACKET sockets always include link layer
+// headers.
+//
+
+// Tests for "raw" (SOCK_RAW) packet(7) sockets.
+class RawPacketTest : public ::testing::TestWithParam<int> {
+ protected:
+ // Creates a socket to be used in tests.
+ void SetUp() override;
+
+ // Closes the socket created by SetUp().
+ void TearDown() override;
+
+ // Gets the device index of the loopback device.
+ int GetLoopbackIndex();
+
+ // The socket used for both reading and writing.
+ int s_;
+};
+
+void RawPacketTest::SetUp() {
+ if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+ ASSERT_THAT(socket(AF_PACKET, SOCK_RAW, htons(GetParam())),
+ SyscallFailsWithErrno(EPERM));
+ GTEST_SKIP();
+ }
+
+ if (!IsRunningOnGvisor()) {
+ // Ensure that looped back packets aren't rejected by the kernel.
+ FileDescriptor acceptLocal = ASSERT_NO_ERRNO_AND_VALUE(
+ Open("/proc/sys/net/ipv4/conf/lo/accept_local", O_RDWR));
+ FileDescriptor routeLocalnet = ASSERT_NO_ERRNO_AND_VALUE(
+ Open("/proc/sys/net/ipv4/conf/lo/route_localnet", O_RDWR));
+ char enabled;
+ ASSERT_THAT(read(acceptLocal.get(), &enabled, 1),
+ SyscallSucceedsWithValue(1));
+ if (enabled != '1') {
+ enabled = '1';
+ ASSERT_THAT(lseek(acceptLocal.get(), 0, SEEK_SET),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(write(acceptLocal.get(), &enabled, 1),
+ SyscallSucceedsWithValue(1));
+ ASSERT_THAT(lseek(acceptLocal.get(), 0, SEEK_SET),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(read(acceptLocal.get(), &enabled, 1),
+ SyscallSucceedsWithValue(1));
+ ASSERT_EQ(enabled, '1');
+ }
+
+ ASSERT_THAT(read(routeLocalnet.get(), &enabled, 1),
+ SyscallSucceedsWithValue(1));
+ if (enabled != '1') {
+ enabled = '1';
+ ASSERT_THAT(lseek(routeLocalnet.get(), 0, SEEK_SET),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(write(routeLocalnet.get(), &enabled, 1),
+ SyscallSucceedsWithValue(1));
+ ASSERT_THAT(lseek(routeLocalnet.get(), 0, SEEK_SET),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(read(routeLocalnet.get(), &enabled, 1),
+ SyscallSucceedsWithValue(1));
+ ASSERT_EQ(enabled, '1');
+ }
+ }
+
+ ASSERT_THAT(s_ = socket(AF_PACKET, SOCK_RAW, htons(GetParam())),
+ SyscallSucceeds());
+}
+
+void RawPacketTest::TearDown() {
+ // TearDown will be run even if we skip the test.
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+ EXPECT_THAT(close(s_), SyscallSucceeds());
+ }
+}
+
+int RawPacketTest::GetLoopbackIndex() {
+ struct ifreq ifr;
+ snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+ EXPECT_THAT(ioctl(s_, SIOCGIFINDEX, &ifr), SyscallSucceeds());
+ EXPECT_NE(ifr.ifr_ifindex, 0);
+ return ifr.ifr_ifindex;
+}
+
+// Receive via a packet socket.
+TEST_P(RawPacketTest, Receive) {
+ // Let's use a simple IP payload: a UDP datagram.
+ FileDescriptor udp_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+ SendUDPMessage(udp_sock.get());
+
+ // Wait for the socket to become readable.
+ struct pollfd pfd = {};
+ pfd.fd = s_;
+ pfd.events = POLLIN;
+ EXPECT_THAT(RetryEINTR(poll)(&pfd, 1, 2000), SyscallSucceedsWithValue(1));
+
+ // Read and verify the data.
+ constexpr size_t packet_size = sizeof(struct ethhdr) + sizeof(struct iphdr) +
+ sizeof(struct udphdr) + sizeof(kMessage);
+ char buf[64];
+ struct sockaddr_ll src = {};
+ socklen_t src_len = sizeof(src);
+ ASSERT_THAT(recvfrom(s_, buf, sizeof(buf), 0,
+ reinterpret_cast<struct sockaddr*>(&src), &src_len),
+ SyscallSucceedsWithValue(packet_size));
+ // sockaddr_ll ends with an 8 byte physical address field, but ethernet
+ // addresses only use 6 bytes. Linux used to return sizeof(sockaddr_ll)-2
+ // here, but since commit b2cf86e1563e33a14a1c69b3e508d15dc12f804c returns
+ // sizeof(sockaddr_ll).
+ ASSERT_THAT(src_len, AnyOf(Eq(sizeof(src)), Eq(sizeof(src) - 2)));
+
+ // TODO(b/129292371): Verify protocol once we return it.
+ // Verify the source address.
+ EXPECT_EQ(src.sll_family, AF_PACKET);
+ EXPECT_EQ(src.sll_ifindex, GetLoopbackIndex());
+ EXPECT_EQ(src.sll_halen, ETH_ALEN);
+ // This came from the loopback device, so the address is all 0s.
+ for (int i = 0; i < src.sll_halen; i++) {
+ EXPECT_EQ(src.sll_addr[i], 0);
+ }
+
+ // Verify the ethernet header. We memcpy to deal with pointer alignment.
+ struct ethhdr eth = {};
+ memcpy(&eth, buf, sizeof(eth));
+ // The destination and source address should be 0, for loopback.
+ for (int i = 0; i < ETH_ALEN; i++) {
+ EXPECT_EQ(eth.h_dest[i], 0);
+ EXPECT_EQ(eth.h_source[i], 0);
+ }
+ EXPECT_EQ(eth.h_proto, htons(ETH_P_IP));
+
+ // Verify the IP header. We memcpy to deal with pointer aligment.
+ struct iphdr ip = {};
+ memcpy(&ip, buf + sizeof(ethhdr), sizeof(ip));
+ EXPECT_EQ(ip.ihl, 5);
+ EXPECT_EQ(ip.version, 4);
+ EXPECT_EQ(ip.tot_len, htons(packet_size - sizeof(eth)));
+ EXPECT_EQ(ip.protocol, IPPROTO_UDP);
+ EXPECT_EQ(ip.daddr, htonl(INADDR_LOOPBACK));
+ EXPECT_EQ(ip.saddr, htonl(INADDR_LOOPBACK));
+
+ // Verify the UDP header. We memcpy to deal with pointer aligment.
+ struct udphdr udp = {};
+ memcpy(&udp, buf + sizeof(eth) + sizeof(iphdr), sizeof(udp));
+ EXPECT_EQ(udp.dest, kPort);
+ EXPECT_EQ(udp.len, htons(sizeof(udphdr) + sizeof(kMessage)));
+
+ // Verify the payload.
+ char* payload = reinterpret_cast<char*>(buf + sizeof(eth) + sizeof(iphdr) +
+ sizeof(udphdr));
+ EXPECT_EQ(strncmp(payload, kMessage, sizeof(kMessage)), 0);
+}
+
+// Send via a packet socket.
+TEST_P(RawPacketTest, Send) {
+ // TODO(b/129292371): Remove once we support packet socket writing.
+ SKIP_IF(IsRunningOnGvisor());
+
+ // Let's send a UDP packet and receive it using a regular UDP socket.
+ FileDescriptor udp_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+ struct sockaddr_in bind_addr = {};
+ bind_addr.sin_family = AF_INET;
+ bind_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ bind_addr.sin_port = kPort;
+ ASSERT_THAT(
+ bind(udp_sock.get(), reinterpret_cast<struct sockaddr*>(&bind_addr),
+ sizeof(bind_addr)),
+ SyscallSucceeds());
+
+ // Set up the destination physical address.
+ struct sockaddr_ll dest = {};
+ dest.sll_family = AF_PACKET;
+ dest.sll_halen = ETH_ALEN;
+ dest.sll_ifindex = GetLoopbackIndex();
+ dest.sll_protocol = htons(ETH_P_IP);
+ // We're sending to the loopback device, so the address is all 0s.
+ memset(dest.sll_addr, 0x00, ETH_ALEN);
+
+ // Set up the ethernet header. The kernel takes care of the footer.
+ // We're sending to and from hardware address 0 (loopback).
+ struct ethhdr eth = {};
+ eth.h_proto = htons(ETH_P_IP);
+
+ // Set up the IP header.
+ struct iphdr iphdr = {};
+ iphdr.ihl = 5;
+ iphdr.version = 4;
+ iphdr.tos = 0;
+ iphdr.tot_len =
+ htons(sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kMessage));
+ // Get a pseudo-random ID. If we clash with an in-use ID the test will fail,
+ // but we have no way of getting an ID we know to be good.
+ srand(*reinterpret_cast<unsigned int*>(&iphdr));
+ iphdr.id = rand();
+ // Linux sets this bit ("do not fragment") for small packets.
+ iphdr.frag_off = 1 << 6;
+ iphdr.ttl = 64;
+ iphdr.protocol = IPPROTO_UDP;
+ iphdr.daddr = htonl(INADDR_LOOPBACK);
+ iphdr.saddr = htonl(INADDR_LOOPBACK);
+ iphdr.check = IPChecksum(iphdr);
+
+ // Set up the UDP header.
+ struct udphdr udphdr = {};
+ udphdr.source = kPort;
+ udphdr.dest = kPort;
+ udphdr.len = htons(sizeof(udphdr) + sizeof(kMessage));
+ udphdr.check = UDPChecksum(iphdr, udphdr, kMessage, sizeof(kMessage));
+
+ // Copy both headers and the payload into our packet buffer.
+ char
+ send_buf[sizeof(eth) + sizeof(iphdr) + sizeof(udphdr) + sizeof(kMessage)];
+ memcpy(send_buf, &eth, sizeof(eth));
+ memcpy(send_buf + sizeof(ethhdr), &iphdr, sizeof(iphdr));
+ memcpy(send_buf + sizeof(ethhdr) + sizeof(iphdr), &udphdr, sizeof(udphdr));
+ memcpy(send_buf + sizeof(ethhdr) + sizeof(iphdr) + sizeof(udphdr), kMessage,
+ sizeof(kMessage));
+
+ // Send it.
+ ASSERT_THAT(sendto(s_, send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<struct sockaddr*>(&dest), sizeof(dest)),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Wait for the packet to become available on both sockets.
+ struct pollfd pfd = {};
+ pfd.fd = udp_sock.get();
+ pfd.events = POLLIN;
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 5000), SyscallSucceedsWithValue(1));
+ pfd.fd = s_;
+ pfd.events = POLLIN;
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 5000), SyscallSucceedsWithValue(1));
+
+ // Receive on the packet socket.
+ char recv_buf[sizeof(send_buf)];
+ ASSERT_THAT(recv(s_, recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ ASSERT_EQ(memcmp(recv_buf, send_buf, sizeof(send_buf)), 0);
+
+ // Receive on the UDP socket.
+ struct sockaddr_in src;
+ socklen_t src_len = sizeof(src);
+ ASSERT_THAT(recvfrom(udp_sock.get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT,
+ reinterpret_cast<struct sockaddr*>(&src), &src_len),
+ SyscallSucceedsWithValue(sizeof(kMessage)));
+ // Check src and payload.
+ EXPECT_EQ(strncmp(recv_buf, kMessage, sizeof(kMessage)), 0);
+ EXPECT_EQ(src.sin_family, AF_INET);
+ EXPECT_EQ(src.sin_port, kPort);
+ EXPECT_EQ(src.sin_addr.s_addr, htonl(INADDR_LOOPBACK));
+}
+
+// Check that setting SO_RCVBUF below min is clamped to the minimum
+// receive buffer size.
+TEST_P(RawPacketTest, SetSocketRecvBufBelowMin) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Discover minimum receive buf size by trying to set it to zero.
+ // See:
+ // https://github.com/torvalds/linux/blob/a5dc8300df75e8b8384b4c82225f1e4a0b4d9b55/net/core/sock.c#L820
+ constexpr int kRcvBufSz = 0;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ int min = 0;
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+ SyscallSucceeds());
+
+ // Linux doubles the value so let's use a value that when doubled will still
+ // be smaller than min.
+ int below_min = min / 2 - 1;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &below_min, sizeof(below_min)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &val, &val_len),
+ SyscallSucceeds());
+
+ ASSERT_EQ(min, val);
+}
+
+// Check that setting SO_RCVBUF above max is clamped to the maximum
+// receive buffer size.
+TEST_P(RawPacketTest, SetSocketRecvBufAboveMax) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Discover max buf size by trying to set the largest possible buffer size.
+ constexpr int kRcvBufSz = 0xffffffff;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ int max = 0;
+ socklen_t max_len = sizeof(max);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &max, &max_len),
+ SyscallSucceeds());
+
+ int above_max = max + 1;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &above_max, sizeof(above_max)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &val, &val_len),
+ SyscallSucceeds());
+ ASSERT_EQ(max, val);
+}
+
+// Check that setting SO_RCVBUF min <= kRcvBufSz <= max is honored.
+TEST_P(RawPacketTest, SetSocketRecvBuf) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ int max = 0;
+ int min = 0;
+ {
+ // Discover max buf size by trying to set a really large buffer size.
+ constexpr int kRcvBufSz = 0xffffffff;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ max = 0;
+ socklen_t max_len = sizeof(max);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &max, &max_len),
+ SyscallSucceeds());
+ }
+
+ {
+ // Discover minimum buffer size by trying to set a zero size receive buffer
+ // size.
+ // See:
+ // https://github.com/torvalds/linux/blob/a5dc8300df75e8b8384b4c82225f1e4a0b4d9b55/net/core/sock.c#L820
+ constexpr int kRcvBufSz = 0;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+ SyscallSucceeds());
+ }
+
+ int quarter_sz = min + (max - min) / 4;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &quarter_sz, sizeof(quarter_sz)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &val, &val_len),
+ SyscallSucceeds());
+
+ // Linux doubles the value set by SO_SNDBUF/SO_RCVBUF.
+ // TODO(gvisor.dev/issue/2926): Remove when Netstack matches linux behavior.
+ if (!IsRunningOnGvisor()) {
+ quarter_sz *= 2;
+ }
+ ASSERT_EQ(quarter_sz, val);
+}
+
+// Check that setting SO_SNDBUF below min is clamped to the minimum
+// receive buffer size.
+TEST_P(RawPacketTest, SetSocketSendBufBelowMin) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Discover minimum buffer size by trying to set it to zero.
+ constexpr int kSndBufSz = 0;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, sizeof(kSndBufSz)),
+ SyscallSucceeds());
+
+ int min = 0;
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &min, &min_len),
+ SyscallSucceeds());
+
+ // Linux doubles the value so let's use a value that when doubled will still
+ // be smaller than min.
+ int below_min = min / 2 - 1;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &below_min, sizeof(below_min)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &val, &val_len),
+ SyscallSucceeds());
+
+ ASSERT_EQ(min, val);
+}
+
+// Check that setting SO_SNDBUF above max is clamped to the maximum
+// send buffer size.
+TEST_P(RawPacketTest, SetSocketSendBufAboveMax) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Discover maximum buffer size by trying to set it to a large value.
+ constexpr int kSndBufSz = 0xffffffff;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, sizeof(kSndBufSz)),
+ SyscallSucceeds());
+
+ int max = 0;
+ socklen_t max_len = sizeof(max);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &max, &max_len),
+ SyscallSucceeds());
+
+ int above_max = max + 1;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &above_max, sizeof(above_max)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &val, &val_len),
+ SyscallSucceeds());
+ ASSERT_EQ(max, val);
+}
+
+// Check that setting SO_SNDBUF min <= kSndBufSz <= max is honored.
+TEST_P(RawPacketTest, SetSocketSendBuf) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ int max = 0;
+ int min = 0;
+ {
+ // Discover maximum buffer size by trying to set it to a large value.
+ constexpr int kSndBufSz = 0xffffffff;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, sizeof(kSndBufSz)),
+ SyscallSucceeds());
+
+ max = 0;
+ socklen_t max_len = sizeof(max);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &max, &max_len),
+ SyscallSucceeds());
+ }
+
+ {
+ // Discover minimum buffer size by trying to set it to zero.
+ constexpr int kSndBufSz = 0;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, sizeof(kSndBufSz)),
+ SyscallSucceeds());
+
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &min, &min_len),
+ SyscallSucceeds());
+ }
+
+ int quarter_sz = min + (max - min) / 4;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &quarter_sz, sizeof(quarter_sz)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &val, &val_len),
+ SyscallSucceeds());
+
+ // Linux doubles the value set by SO_SNDBUF/SO_RCVBUF.
+ // TODO(gvisor.dev/issue/2926): Remove the gvisor special casing when Netstack
+ // matches linux behavior.
+ if (!IsRunningOnGvisor()) {
+ quarter_sz *= 2;
+ }
+
+ ASSERT_EQ(quarter_sz, val);
+}
+
+INSTANTIATE_TEST_SUITE_P(AllInetTests, RawPacketTest,
+ ::testing::Values(ETH_P_IP, ETH_P_ALL));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/partial_bad_buffer.cc b/test/syscalls/linux/partial_bad_buffer.cc
new file mode 100644
index 000000000..df7129acc
--- /dev/null
+++ b/test/syscalls/linux/partial_bad_buffer.cc
@@ -0,0 +1,405 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+using ::testing::Gt;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr char kMessage[] = "hello world";
+
+// PartialBadBufferTest checks the result of various IO syscalls when passed a
+// buffer that does not have the space specified in the syscall (most of it is
+// PROT_NONE). Linux is annoyingly inconsistent among different syscalls, so we
+// test all of them.
+class PartialBadBufferTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ // Create and open a directory for getdents cases.
+ directory_ = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ ASSERT_THAT(
+ directory_fd_ = open(directory_.path().c_str(), O_RDONLY | O_DIRECTORY),
+ SyscallSucceeds());
+
+ // Create and open a normal file, placing it in the directory
+ // so the getdents cases have some dirents.
+ name_ = JoinPath(directory_.path(), "a");
+ ASSERT_THAT(fd_ = open(name_.c_str(), O_RDWR | O_CREAT, 0644),
+ SyscallSucceeds());
+
+ // Write some initial data.
+ size_t size = sizeof(kMessage) - 1;
+ EXPECT_THAT(WriteFd(fd_, &kMessage, size), SyscallSucceedsWithValue(size));
+ ASSERT_THAT(lseek(fd_, 0, SEEK_SET), SyscallSucceeds());
+
+ // Map a useable buffer.
+ addr_ = mmap(0, 2 * kPageSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(addr_, MAP_FAILED);
+ char* buf = reinterpret_cast<char*>(addr_);
+
+ // Guard page for our read to run into.
+ ASSERT_THAT(mprotect(reinterpret_cast<void*>(buf + kPageSize), kPageSize,
+ PROT_NONE),
+ SyscallSucceeds());
+
+ // Leave only one free byte in the buffer.
+ bad_buffer_ = buf + kPageSize - 1;
+ }
+
+ off_t Size() {
+ struct stat st;
+ int rc = fstat(fd_, &st);
+ if (rc < 0) {
+ return static_cast<off_t>(rc);
+ }
+ return st.st_size;
+ }
+
+ void TearDown() override {
+ EXPECT_THAT(munmap(addr_, 2 * kPageSize), SyscallSucceeds()) << addr_;
+ EXPECT_THAT(close(fd_), SyscallSucceeds());
+ EXPECT_THAT(unlink(name_.c_str()), SyscallSucceeds());
+ EXPECT_THAT(close(directory_fd_), SyscallSucceeds());
+ }
+
+ // Return buffer with n bytes of free space.
+ // N.B. this is the same buffer used to back bad_buffer_.
+ char* FreeBytes(size_t n) {
+ TEST_CHECK(n <= static_cast<size_t>(4096));
+ return reinterpret_cast<char*>(addr_) + kPageSize - n;
+ }
+
+ std::string name_;
+ int fd_;
+ TempPath directory_;
+ int directory_fd_;
+ void* addr_;
+ char* bad_buffer_;
+};
+
+// We do both "big" and "small" tests to try to hit the "zero copy" and
+// non-"zero copy" paths, which have different code paths for handling faults.
+
+TEST_F(PartialBadBufferTest, ReadBig) {
+ EXPECT_THAT(RetryEINTR(read)(fd_, bad_buffer_, kPageSize),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ('h', bad_buffer_[0]);
+}
+
+TEST_F(PartialBadBufferTest, ReadSmall) {
+ EXPECT_THAT(RetryEINTR(read)(fd_, bad_buffer_, 10),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ('h', bad_buffer_[0]);
+}
+
+TEST_F(PartialBadBufferTest, PreadBig) {
+ EXPECT_THAT(RetryEINTR(pread)(fd_, bad_buffer_, kPageSize, 0),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ('h', bad_buffer_[0]);
+}
+
+TEST_F(PartialBadBufferTest, PreadSmall) {
+ EXPECT_THAT(RetryEINTR(pread)(fd_, bad_buffer_, 10, 0),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ('h', bad_buffer_[0]);
+}
+
+TEST_F(PartialBadBufferTest, ReadvBig) {
+ struct iovec vec;
+ vec.iov_base = bad_buffer_;
+ vec.iov_len = kPageSize;
+
+ EXPECT_THAT(RetryEINTR(readv)(fd_, &vec, 1), SyscallSucceedsWithValue(1));
+ EXPECT_EQ('h', bad_buffer_[0]);
+}
+
+TEST_F(PartialBadBufferTest, ReadvSmall) {
+ struct iovec vec;
+ vec.iov_base = bad_buffer_;
+ vec.iov_len = 10;
+
+ EXPECT_THAT(RetryEINTR(readv)(fd_, &vec, 1), SyscallSucceedsWithValue(1));
+ EXPECT_EQ('h', bad_buffer_[0]);
+}
+
+TEST_F(PartialBadBufferTest, PreadvBig) {
+ struct iovec vec;
+ vec.iov_base = bad_buffer_;
+ vec.iov_len = kPageSize;
+
+ EXPECT_THAT(RetryEINTR(preadv)(fd_, &vec, 1, 0), SyscallSucceedsWithValue(1));
+ EXPECT_EQ('h', bad_buffer_[0]);
+}
+
+TEST_F(PartialBadBufferTest, PreadvSmall) {
+ struct iovec vec;
+ vec.iov_base = bad_buffer_;
+ vec.iov_len = 10;
+
+ EXPECT_THAT(RetryEINTR(preadv)(fd_, &vec, 1, 0), SyscallSucceedsWithValue(1));
+ EXPECT_EQ('h', bad_buffer_[0]);
+}
+
+TEST_F(PartialBadBufferTest, WriteBig) {
+ off_t orig_size = Size();
+ int n;
+
+ ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(
+ (n = RetryEINTR(write)(fd_, bad_buffer_, kPageSize)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
+}
+
+TEST_F(PartialBadBufferTest, WriteSmall) {
+ off_t orig_size = Size();
+ int n;
+
+ ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(
+ (n = RetryEINTR(write)(fd_, bad_buffer_, 10)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
+}
+
+TEST_F(PartialBadBufferTest, PwriteBig) {
+ off_t orig_size = Size();
+ int n;
+
+ EXPECT_THAT(
+ (n = RetryEINTR(pwrite)(fd_, bad_buffer_, kPageSize, orig_size)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
+}
+
+TEST_F(PartialBadBufferTest, PwriteSmall) {
+ off_t orig_size = Size();
+ int n;
+
+ EXPECT_THAT(
+ (n = RetryEINTR(pwrite)(fd_, bad_buffer_, 10, orig_size)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
+}
+
+TEST_F(PartialBadBufferTest, WritevBig) {
+ struct iovec vec;
+ vec.iov_base = bad_buffer_;
+ vec.iov_len = kPageSize;
+ off_t orig_size = Size();
+ int n;
+
+ ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(
+ (n = RetryEINTR(writev)(fd_, &vec, 1)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
+}
+
+TEST_F(PartialBadBufferTest, WritevSmall) {
+ struct iovec vec;
+ vec.iov_base = bad_buffer_;
+ vec.iov_len = 10;
+ off_t orig_size = Size();
+ int n;
+
+ ASSERT_THAT(lseek(fd_, orig_size, SEEK_SET), SyscallSucceeds());
+ EXPECT_THAT(
+ (n = RetryEINTR(writev)(fd_, &vec, 1)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
+}
+
+TEST_F(PartialBadBufferTest, PwritevBig) {
+ struct iovec vec;
+ vec.iov_base = bad_buffer_;
+ vec.iov_len = kPageSize;
+ off_t orig_size = Size();
+ int n;
+
+ EXPECT_THAT(
+ (n = RetryEINTR(pwritev)(fd_, &vec, 1, orig_size)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
+}
+
+TEST_F(PartialBadBufferTest, PwritevSmall) {
+ struct iovec vec;
+ vec.iov_base = bad_buffer_;
+ vec.iov_len = 10;
+ off_t orig_size = Size();
+ int n;
+
+ EXPECT_THAT(
+ (n = RetryEINTR(pwritev)(fd_, &vec, 1, orig_size)),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(1)));
+ EXPECT_EQ(Size(), orig_size + (n >= 0 ? n : 0));
+}
+
+// getdents returns EFAULT when the you claim the buffer is large enough, but
+// it actually isn't.
+TEST_F(PartialBadBufferTest, GetdentsBig) {
+ EXPECT_THAT(RetryEINTR(syscall)(SYS_getdents64, directory_fd_, bad_buffer_,
+ kPageSize),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+// getdents returns EINVAL when the you claim the buffer is too small.
+TEST_F(PartialBadBufferTest, GetdentsSmall) {
+ EXPECT_THAT(
+ RetryEINTR(syscall)(SYS_getdents64, directory_fd_, bad_buffer_, 10),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// getdents will write entries into a buffer if there is space before it faults.
+TEST_F(PartialBadBufferTest, GetdentsOneEntry) {
+ // 30 bytes is enough for one (small) entry.
+ char* buf = FreeBytes(30);
+
+ EXPECT_THAT(
+ RetryEINTR(syscall)(SYS_getdents64, directory_fd_, buf, kPageSize),
+ SyscallSucceedsWithValue(Gt(0)));
+}
+
+PosixErrorOr<sockaddr_storage> InetLoopbackAddr(int family) {
+ struct sockaddr_storage addr;
+ memset(&addr, 0, sizeof(addr));
+ addr.ss_family = family;
+ switch (family) {
+ case AF_INET:
+ reinterpret_cast<struct sockaddr_in*>(&addr)->sin_addr.s_addr =
+ htonl(INADDR_LOOPBACK);
+ break;
+ case AF_INET6:
+ reinterpret_cast<struct sockaddr_in6*>(&addr)->sin6_addr =
+ in6addr_loopback;
+ break;
+ default:
+ return PosixError(EINVAL,
+ absl::StrCat("unknown socket family: ", family));
+ }
+ return addr;
+}
+
+// SendMsgTCP verifies that calling sendmsg with a bad address returns an
+// EFAULT. It also verifies that passing a buffer which is made up of 2
+// pages one valid and one guard page succeeds as long as the write is
+// for exactly the size of 1 page.
+TEST_F(PartialBadBufferTest, SendMsgTCP) {
+ auto listen_socket =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
+
+ // Initialize address to the loopback one.
+ sockaddr_storage addr = ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(AF_INET));
+ socklen_t addrlen = sizeof(addr);
+
+ // Bind to some port then start listening.
+ ASSERT_THAT(bind(listen_socket.get(),
+ reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(listen_socket.get(), SOMAXCONN), SyscallSucceeds());
+
+ // Get the address we're listening on, then connect to it. We need to do this
+ // because we're allowing the stack to pick a port for us.
+ ASSERT_THAT(getsockname(listen_socket.get(),
+ reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ auto send_socket =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
+
+ ASSERT_THAT(
+ RetryEINTR(connect)(send_socket.get(),
+ reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+
+ // Accept the connection.
+ auto recv_socket =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_socket.get(), nullptr, nullptr));
+
+ // TODO(gvisor.dev/issue/674): Update this once Netstack matches linux
+ // behaviour on a setsockopt of SO_RCVBUF/SO_SNDBUF.
+ //
+ // Set SO_SNDBUF for socket to exactly kPageSize+1.
+ //
+ // gVisor does not double the value passed in SO_SNDBUF like linux does so we
+ // just increase it by 1 byte here for gVisor so that we can test writing 1
+ // byte past the valid page and check that it triggers an EFAULT
+ // correctly. Otherwise in gVisor the sendmsg call will just return with no
+ // error with kPageSize bytes written successfully.
+ const uint32_t buf_size = kPageSize + 1;
+ ASSERT_THAT(setsockopt(send_socket.get(), SOL_SOCKET, SO_SNDBUF, &buf_size,
+ sizeof(buf_size)),
+ SyscallSucceedsWithValue(0));
+
+ struct msghdr hdr = {};
+ struct iovec iov = {};
+ iov.iov_base = bad_buffer_;
+ iov.iov_len = kPageSize;
+ hdr.msg_iov = &iov;
+ hdr.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(send_socket.get(), &hdr, 0),
+ SyscallFailsWithErrno(EFAULT));
+
+ // Now assert that writing kPageSize from addr_ succeeds.
+ iov.iov_base = addr_;
+ ASSERT_THAT(RetryEINTR(sendmsg)(send_socket.get(), &hdr, 0),
+ SyscallSucceedsWithValue(kPageSize));
+ // Read all the data out so that we drain the socket SND_BUF on the sender.
+ std::vector<char> buffer(kPageSize);
+ ASSERT_THAT(RetryEINTR(read)(recv_socket.get(), buffer.data(), kPageSize),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Sleep for a shortwhile to ensure that we have time to process the
+ // ACKs. This is not strictly required unless running under gotsan which is a
+ // lot slower and can result in the next write to write only 1 byte instead of
+ // our intended kPageSize + 1.
+ absl::SleepFor(absl::Milliseconds(50));
+
+ // Now assert that writing > kPageSize results in EFAULT.
+ iov.iov_len = kPageSize + 1;
+ ASSERT_THAT(RetryEINTR(sendmsg)(send_socket.get(), &hdr, 0),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/pause.cc b/test/syscalls/linux/pause.cc
new file mode 100644
index 000000000..8c05efd6f
--- /dev/null
+++ b/test/syscalls/linux/pause.cc
@@ -0,0 +1,88 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <atomic>
+
+#include "gtest/gtest.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+void NoopSignalHandler(int sig, siginfo_t* info, void* context) {}
+
+} // namespace
+
+TEST(PauseTest, OnlyReturnsWhenSignalHandled) {
+ struct sigaction sa;
+ sigfillset(&sa.sa_mask);
+
+ // Ensure that SIGUSR1 is ignored.
+ sa.sa_handler = SIG_IGN;
+ ASSERT_THAT(sigaction(SIGUSR1, &sa, nullptr), SyscallSucceeds());
+
+ // Register a handler for SIGUSR2.
+ sa.sa_sigaction = NoopSignalHandler;
+ sa.sa_flags = SA_SIGINFO;
+ ASSERT_THAT(sigaction(SIGUSR2, &sa, nullptr), SyscallSucceeds());
+
+ // The child sets their own tid.
+ absl::Mutex mu;
+ pid_t child_tid = 0;
+ bool child_tid_available = false;
+ std::atomic<int> sent_signal{0};
+ std::atomic<int> waking_signal{0};
+ ScopedThread t([&] {
+ mu.Lock();
+ child_tid = gettid();
+ child_tid_available = true;
+ mu.Unlock();
+ EXPECT_THAT(pause(), SyscallFailsWithErrno(EINTR));
+ waking_signal.store(sent_signal.load());
+ });
+ mu.Lock();
+ mu.Await(absl::Condition(&child_tid_available));
+ mu.Unlock();
+
+ // Wait a bit to let the child enter pause().
+ absl::SleepFor(absl::Seconds(3));
+
+ // The child should not be woken by SIGUSR1.
+ sent_signal.store(SIGUSR1);
+ ASSERT_THAT(tgkill(getpid(), child_tid, SIGUSR1), SyscallSucceeds());
+ absl::SleepFor(absl::Seconds(3));
+
+ // The child should be woken by SIGUSR2.
+ sent_signal.store(SIGUSR2);
+ ASSERT_THAT(tgkill(getpid(), child_tid, SIGUSR2), SyscallSucceeds());
+ absl::SleepFor(absl::Seconds(3));
+
+ EXPECT_EQ(SIGUSR2, waking_signal.load());
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/ping_socket.cc b/test/syscalls/linux/ping_socket.cc
new file mode 100644
index 000000000..a9bfdb37b
--- /dev/null
+++ b/test/syscalls/linux/ping_socket.cc
@@ -0,0 +1,91 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/save_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+class PingSocket : public ::testing::Test {
+ protected:
+ // Creates a socket to be used in tests.
+ void SetUp() override;
+
+ // Closes the socket created by SetUp().
+ void TearDown() override;
+
+ // The loopback address.
+ struct sockaddr_in addr_;
+};
+
+void PingSocket::SetUp() {
+ // On some hosts ping sockets are restricted to specific groups using the
+ // sysctl "ping_group_range".
+ int s = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
+ if (s < 0 && errno == EPERM) {
+ GTEST_SKIP();
+ }
+ close(s);
+
+ addr_ = {};
+ // Just a random port as the destination port number is irrelevant for ping
+ // sockets.
+ addr_.sin_port = 12345;
+ addr_.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ addr_.sin_family = AF_INET;
+}
+
+void PingSocket::TearDown() {}
+
+// Test ICMP port exhaustion returns EAGAIN.
+//
+// We disable both random/cooperative S/R for this test as it makes way too many
+// syscalls.
+TEST_F(PingSocket, ICMPPortExhaustion_NoRandomSave) {
+ DisableSave ds;
+ std::vector<FileDescriptor> sockets;
+ constexpr int kSockets = 65536;
+ addr_.sin_port = 0;
+ for (int i = 0; i < kSockets; i++) {
+ auto s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP));
+ int ret = connect(s.get(), reinterpret_cast<struct sockaddr*>(&addr_),
+ sizeof(addr_));
+ if (ret == 0) {
+ sockets.push_back(std::move(s));
+ continue;
+ }
+ ASSERT_THAT(ret, SyscallFailsWithErrno(EAGAIN));
+ break;
+ }
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
new file mode 100644
index 000000000..34291850d
--- /dev/null
+++ b/test/syscalls/linux/pipe.cc
@@ -0,0 +1,670 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h> /* Obtain O_* constant definitions */
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "absl/synchronization/notification.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Used as a non-zero sentinel value, below.
+constexpr int kTestValue = 0x12345678;
+
+// Used for synchronization in race tests.
+const absl::Duration syncDelay = absl::Seconds(2);
+
+struct PipeCreator {
+ std::string name_;
+
+ // void (fds, is_blocking, is_namedpipe).
+ std::function<void(int[2], bool*, bool*)> create_;
+};
+
+class PipeTest : public ::testing::TestWithParam<PipeCreator> {
+ public:
+ static void SetUpTestSuite() {
+ // Tests intentionally generate SIGPIPE.
+ TEST_PCHECK(signal(SIGPIPE, SIG_IGN) != SIG_ERR);
+ }
+
+ // Initializes rfd_ and wfd_ as a blocking pipe.
+ //
+ // The return value indicates success: the test should be skipped otherwise.
+ bool CreateBlocking() { return create(true); }
+
+ // Initializes rfd_ and wfd_ as a non-blocking pipe.
+ //
+ // The return value is per CreateBlocking.
+ bool CreateNonBlocking() { return create(false); }
+
+ // Returns true iff the pipe represents a named pipe.
+ bool IsNamedPipe() const { return named_pipe_; }
+
+ int Size() const {
+ int s1 = fcntl(rfd_.get(), F_GETPIPE_SZ);
+ int s2 = fcntl(wfd_.get(), F_GETPIPE_SZ);
+ EXPECT_GT(s1, 0);
+ EXPECT_GT(s2, 0);
+ EXPECT_EQ(s1, s2);
+ return s1;
+ }
+
+ static void TearDownTestSuite() {
+ TEST_PCHECK(signal(SIGPIPE, SIG_DFL) != SIG_ERR);
+ }
+
+ private:
+ bool create(bool wants_blocking) {
+ // Generate the pipe.
+ int fds[2] = {-1, -1};
+ bool is_blocking = false;
+ GetParam().create_(fds, &is_blocking, &named_pipe_);
+ if (fds[0] < 0 || fds[1] < 0) {
+ return false;
+ }
+
+ // Save descriptors.
+ rfd_.reset(fds[0]);
+ wfd_.reset(fds[1]);
+
+ // Adjust blocking, if needed.
+ if (!is_blocking && wants_blocking) {
+ // Clear the blocking flag.
+ EXPECT_THAT(fcntl(fds[0], F_SETFL, 0), SyscallSucceeds());
+ EXPECT_THAT(fcntl(fds[1], F_SETFL, 0), SyscallSucceeds());
+ } else if (is_blocking && !wants_blocking) {
+ // Set the descriptors to blocking.
+ EXPECT_THAT(fcntl(fds[0], F_SETFL, O_NONBLOCK), SyscallSucceeds());
+ EXPECT_THAT(fcntl(fds[1], F_SETFL, O_NONBLOCK), SyscallSucceeds());
+ }
+
+ return true;
+ }
+
+ protected:
+ FileDescriptor rfd_;
+ FileDescriptor wfd_;
+
+ private:
+ bool named_pipe_ = false;
+};
+
+TEST_P(PipeTest, Inode) {
+ SKIP_IF(!CreateBlocking());
+
+ // Ensure that the inode number is the same for each end.
+ struct stat rst;
+ ASSERT_THAT(fstat(rfd_.get(), &rst), SyscallSucceeds());
+ struct stat wst;
+ ASSERT_THAT(fstat(wfd_.get(), &wst), SyscallSucceeds());
+ EXPECT_EQ(rst.st_ino, wst.st_ino);
+}
+
+TEST_P(PipeTest, Permissions) {
+ SKIP_IF(!CreateBlocking());
+
+ // Attempt bad operations.
+ int buf = kTestValue;
+ ASSERT_THAT(write(rfd_.get(), &buf, sizeof(buf)),
+ SyscallFailsWithErrno(EBADF));
+ EXPECT_THAT(read(wfd_.get(), &buf, sizeof(buf)),
+ SyscallFailsWithErrno(EBADF));
+}
+
+TEST_P(PipeTest, Flags) {
+ SKIP_IF(!CreateBlocking());
+
+ if (IsNamedPipe()) {
+ // May be stubbed to zero; define locally.
+ EXPECT_THAT(fcntl(rfd_.get(), F_GETFL),
+ SyscallSucceedsWithValue(kOLargeFile | O_RDONLY));
+ EXPECT_THAT(fcntl(wfd_.get(), F_GETFL),
+ SyscallSucceedsWithValue(kOLargeFile | O_WRONLY));
+ } else {
+ EXPECT_THAT(fcntl(rfd_.get(), F_GETFL), SyscallSucceedsWithValue(O_RDONLY));
+ EXPECT_THAT(fcntl(wfd_.get(), F_GETFL), SyscallSucceedsWithValue(O_WRONLY));
+ }
+}
+
+TEST_P(PipeTest, Write) {
+ SKIP_IF(!CreateBlocking());
+
+ int wbuf = kTestValue;
+ int rbuf = ~kTestValue;
+ ASSERT_THAT(write(wfd_.get(), &wbuf, sizeof(wbuf)),
+ SyscallSucceedsWithValue(sizeof(wbuf)));
+ ASSERT_THAT(read(rfd_.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(sizeof(rbuf)));
+ EXPECT_EQ(wbuf, rbuf);
+}
+
+TEST_P(PipeTest, WritePage) {
+ SKIP_IF(!CreateBlocking());
+
+ std::vector<char> wbuf(kPageSize);
+ RandomizeBuffer(wbuf.data(), wbuf.size());
+ std::vector<char> rbuf(wbuf.size());
+
+ ASSERT_THAT(write(wfd_.get(), wbuf.data(), wbuf.size()),
+ SyscallSucceedsWithValue(wbuf.size()));
+ ASSERT_THAT(read(rfd_.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(rbuf.size()));
+ EXPECT_EQ(memcmp(rbuf.data(), wbuf.data(), wbuf.size()), 0);
+}
+
+TEST_P(PipeTest, NonBlocking) {
+ SKIP_IF(!CreateNonBlocking());
+
+ int wbuf = kTestValue;
+ int rbuf = ~kTestValue;
+ EXPECT_THAT(read(rfd_.get(), &rbuf, sizeof(rbuf)),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+ ASSERT_THAT(write(wfd_.get(), &wbuf, sizeof(wbuf)),
+ SyscallSucceedsWithValue(sizeof(wbuf)));
+
+ ASSERT_THAT(read(rfd_.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(sizeof(rbuf)));
+ EXPECT_EQ(wbuf, rbuf);
+ EXPECT_THAT(read(rfd_.get(), &rbuf, sizeof(rbuf)),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST(Pipe2Test, CloExec) {
+ int fds[2];
+ ASSERT_THAT(pipe2(fds, O_CLOEXEC), SyscallSucceeds());
+ EXPECT_THAT(fcntl(fds[0], F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
+ EXPECT_THAT(fcntl(fds[1], F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
+ EXPECT_THAT(close(fds[0]), SyscallSucceeds());
+ EXPECT_THAT(close(fds[1]), SyscallSucceeds());
+}
+
+TEST(Pipe2Test, BadOptions) {
+ int fds[2];
+ EXPECT_THAT(pipe2(fds, 0xDEAD), SyscallFailsWithErrno(EINVAL));
+}
+
+// Tests that opening named pipes with O_TRUNC shouldn't cause an error, but
+// calls to (f)truncate should.
+TEST(NamedPipeTest, Truncate) {
+ const std::string tmp_path = NewTempAbsPath();
+ SKIP_IF(mkfifo(tmp_path.c_str(), 0644) != 0);
+
+ ASSERT_THAT(open(tmp_path.c_str(), O_NONBLOCK | O_RDONLY), SyscallSucceeds());
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(tmp_path.c_str(), O_RDWR | O_NONBLOCK | O_TRUNC));
+
+ ASSERT_THAT(truncate(tmp_path.c_str(), 0), SyscallFailsWithErrno(EINVAL));
+ ASSERT_THAT(ftruncate(fd.get(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(PipeTest, Seek) {
+ SKIP_IF(!CreateBlocking());
+
+ for (int i = 0; i < 4; i++) {
+ // Attempt absolute seeks.
+ EXPECT_THAT(lseek(rfd_.get(), 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(lseek(rfd_.get(), 4, SEEK_SET), SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(lseek(wfd_.get(), 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(lseek(wfd_.get(), 4, SEEK_SET), SyscallFailsWithErrno(ESPIPE));
+
+ // Attempt relative seeks.
+ EXPECT_THAT(lseek(rfd_.get(), 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(lseek(rfd_.get(), 4, SEEK_CUR), SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(lseek(wfd_.get(), 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(lseek(wfd_.get(), 4, SEEK_CUR), SyscallFailsWithErrno(ESPIPE));
+
+ // Attempt end-of-file seeks.
+ EXPECT_THAT(lseek(rfd_.get(), 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(lseek(rfd_.get(), -4, SEEK_END), SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(lseek(wfd_.get(), 0, SEEK_CUR), SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(lseek(wfd_.get(), -4, SEEK_END), SyscallFailsWithErrno(ESPIPE));
+
+ // Add some more data to the pipe.
+ int buf = kTestValue;
+ ASSERT_THAT(write(wfd_.get(), &buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ }
+}
+
+TEST_P(PipeTest, OffsetCalls) {
+ SKIP_IF(!CreateBlocking());
+
+ int buf;
+ EXPECT_THAT(pread(wfd_.get(), &buf, sizeof(buf), 0),
+ SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(pwrite(rfd_.get(), &buf, sizeof(buf), 0),
+ SyscallFailsWithErrno(ESPIPE));
+
+ struct iovec iov;
+ iov.iov_base = &buf;
+ iov.iov_len = sizeof(buf);
+ EXPECT_THAT(preadv(wfd_.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(pwritev(rfd_.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE));
+}
+
+TEST_P(PipeTest, WriterSideCloses) {
+ SKIP_IF(!CreateBlocking());
+
+ ScopedThread t([this]() {
+ int buf = ~kTestValue;
+ ASSERT_THAT(read(rfd_.get(), &buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ EXPECT_EQ(buf, kTestValue);
+ // This will return when the close() completes.
+ ASSERT_THAT(read(rfd_.get(), &buf, sizeof(buf)), SyscallSucceeds());
+ // This will return straight away.
+ ASSERT_THAT(read(rfd_.get(), &buf, sizeof(buf)),
+ SyscallSucceedsWithValue(0));
+ });
+
+ // Sleep a bit so the thread can block.
+ absl::SleepFor(syncDelay);
+
+ // Write to unblock.
+ int buf = kTestValue;
+ ASSERT_THAT(write(wfd_.get(), &buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Sleep a bit so the thread can block again.
+ absl::SleepFor(syncDelay);
+
+ // Allow the thread to complete.
+ ASSERT_THAT(close(wfd_.release()), SyscallSucceeds());
+ t.Join();
+}
+
+TEST_P(PipeTest, WriterSideClosesReadDataFirst) {
+ SKIP_IF(!CreateBlocking());
+
+ int wbuf = kTestValue;
+ ASSERT_THAT(write(wfd_.get(), &wbuf, sizeof(wbuf)),
+ SyscallSucceedsWithValue(sizeof(wbuf)));
+ ASSERT_THAT(close(wfd_.release()), SyscallSucceeds());
+
+ int rbuf;
+ ASSERT_THAT(read(rfd_.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(sizeof(rbuf)));
+ EXPECT_EQ(wbuf, rbuf);
+ EXPECT_THAT(read(rfd_.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(PipeTest, ReaderSideCloses) {
+ SKIP_IF(!CreateBlocking());
+
+ ASSERT_THAT(close(rfd_.release()), SyscallSucceeds());
+ int buf = kTestValue;
+ EXPECT_THAT(write(wfd_.get(), &buf, sizeof(buf)),
+ SyscallFailsWithErrno(EPIPE));
+}
+
+TEST_P(PipeTest, CloseTwice) {
+ SKIP_IF(!CreateBlocking());
+
+ int reader = rfd_.release();
+ int writer = wfd_.release();
+ ASSERT_THAT(close(reader), SyscallSucceeds());
+ ASSERT_THAT(close(writer), SyscallSucceeds());
+ EXPECT_THAT(close(reader), SyscallFailsWithErrno(EBADF));
+ EXPECT_THAT(close(writer), SyscallFailsWithErrno(EBADF));
+}
+
+// Blocking write returns EPIPE when read end is closed if nothing has been
+// written.
+TEST_P(PipeTest, BlockWriteClosed) {
+ SKIP_IF(!CreateBlocking());
+
+ absl::Notification notify;
+ ScopedThread t([this, &notify]() {
+ std::vector<char> buf(Size());
+ // Exactly fill the pipe buffer.
+ ASSERT_THAT(WriteFd(wfd_.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ notify.Notify();
+
+ // Attempt to write one more byte. Blocks.
+ // N.B. Don't use WriteFd, we don't want a retry.
+ EXPECT_THAT(write(wfd_.get(), buf.data(), 1), SyscallFailsWithErrno(EPIPE));
+ });
+
+ notify.WaitForNotification();
+ ASSERT_THAT(close(rfd_.release()), SyscallSucceeds());
+ t.Join();
+}
+
+// Blocking write returns EPIPE when read end is closed even if something has
+// been written.
+TEST_P(PipeTest, BlockPartialWriteClosed) {
+ SKIP_IF(!CreateBlocking());
+
+ ScopedThread t([this]() {
+ const int pipe_size = Size();
+ std::vector<char> buf(2 * pipe_size);
+
+ // Write more than fits in the buffer. Blocks then returns partial write
+ // when the other end is closed. The next call returns EPIPE.
+ ASSERT_THAT(write(wfd_.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(pipe_size));
+ EXPECT_THAT(write(wfd_.get(), buf.data(), buf.size()),
+ SyscallFailsWithErrno(EPIPE));
+ });
+
+ // Leave time for write to become blocked.
+ absl::SleepFor(syncDelay);
+
+ // Unblock the above.
+ ASSERT_THAT(close(rfd_.release()), SyscallSucceeds());
+ t.Join();
+}
+
+TEST_P(PipeTest, ReadFromClosedFd_NoRandomSave) {
+ SKIP_IF(!CreateBlocking());
+
+ absl::Notification notify;
+ ScopedThread t([this, &notify]() {
+ notify.Notify();
+ int buf;
+ ASSERT_THAT(read(rfd_.get(), &buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ ASSERT_EQ(kTestValue, buf);
+ });
+ notify.WaitForNotification();
+
+ // Make sure that the thread gets to read().
+ absl::SleepFor(syncDelay);
+
+ {
+ // We cannot save/restore here as the read end of pipe is closed but there
+ // is ongoing read() above. We will not be able to restart the read()
+ // successfully in restore run since the read fd is closed.
+ const DisableSave ds;
+ ASSERT_THAT(close(rfd_.release()), SyscallSucceeds());
+ int buf = kTestValue;
+ ASSERT_THAT(write(wfd_.get(), &buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ t.Join();
+ }
+}
+
+TEST_P(PipeTest, FionRead) {
+ SKIP_IF(!CreateBlocking());
+
+ int n;
+ ASSERT_THAT(ioctl(rfd_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+ ASSERT_THAT(ioctl(wfd_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ std::vector<char> buf(Size());
+ ASSERT_THAT(write(wfd_.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ EXPECT_THAT(ioctl(rfd_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, buf.size());
+ EXPECT_THAT(ioctl(wfd_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, buf.size());
+}
+
+// Test that opening an empty anonymous pipe RDONLY via /proc/self/fd/N does not
+// block waiting for a writer.
+TEST_P(PipeTest, OpenViaProcSelfFD) {
+ SKIP_IF(!CreateBlocking());
+ SKIP_IF(IsNamedPipe());
+
+ // Close the write end of the pipe.
+ ASSERT_THAT(close(wfd_.release()), SyscallSucceeds());
+
+ // Open other side via /proc/self/fd. It should not block.
+ FileDescriptor proc_self_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(absl::StrCat("/proc/self/fd/", rfd_.get()), O_RDONLY));
+}
+
+// Test that opening and reading from an anonymous pipe (with existing writes)
+// RDONLY via /proc/self/fd/N returns the existing data.
+TEST_P(PipeTest, OpenViaProcSelfFDWithWrites) {
+ SKIP_IF(!CreateBlocking());
+ SKIP_IF(IsNamedPipe());
+
+ // Write to the pipe and then close the write fd.
+ int wbuf = kTestValue;
+ ASSERT_THAT(write(wfd_.get(), &wbuf, sizeof(wbuf)),
+ SyscallSucceedsWithValue(sizeof(wbuf)));
+ ASSERT_THAT(close(wfd_.release()), SyscallSucceeds());
+
+ // Open read side via /proc/self/fd, and read from it.
+ FileDescriptor proc_self_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(absl::StrCat("/proc/self/fd/", rfd_.get()), O_RDONLY));
+ int rbuf;
+ ASSERT_THAT(read(proc_self_fd.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(sizeof(rbuf)));
+ EXPECT_EQ(wbuf, rbuf);
+}
+
+// Test that accesses of /proc/<PID>/fd correctly decrement the refcount.
+TEST_P(PipeTest, ProcFDReleasesFile) {
+ SKIP_IF(!CreateBlocking());
+
+ // Stat the pipe FD, which shouldn't alter the refcount.
+ struct stat wst;
+ ASSERT_THAT(lstat(absl::StrCat("/proc/self/fd/", wfd_.get()).c_str(), &wst),
+ SyscallSucceeds());
+
+ // Close the write end and ensure that read indicates EOF.
+ wfd_.reset();
+ char buf;
+ ASSERT_THAT(read(rfd_.get(), &buf, 1), SyscallSucceedsWithValue(0));
+}
+
+// Same for /proc/<PID>/fdinfo.
+TEST_P(PipeTest, ProcFDInfoReleasesFile) {
+ SKIP_IF(!CreateBlocking());
+
+ // Stat the pipe FD, which shouldn't alter the refcount.
+ struct stat wst;
+ ASSERT_THAT(
+ lstat(absl::StrCat("/proc/self/fdinfo/", wfd_.get()).c_str(), &wst),
+ SyscallSucceeds());
+
+ // Close the write end and ensure that read indicates EOF.
+ wfd_.reset();
+ char buf;
+ ASSERT_THAT(read(rfd_.get(), &buf, 1), SyscallSucceedsWithValue(0));
+}
+
+TEST_P(PipeTest, SizeChange) {
+ SKIP_IF(!CreateBlocking());
+
+ // Set the minimum possible size.
+ ASSERT_THAT(fcntl(rfd_.get(), F_SETPIPE_SZ, 0), SyscallSucceeds());
+ int min = Size();
+ EXPECT_GT(min, 0); // Should be rounded up.
+
+ // Set from the read end.
+ ASSERT_THAT(fcntl(rfd_.get(), F_SETPIPE_SZ, min + 1), SyscallSucceeds());
+ int med = Size();
+ EXPECT_GT(med, min); // Should have grown, may be rounded.
+
+ // Set from the write end.
+ ASSERT_THAT(fcntl(wfd_.get(), F_SETPIPE_SZ, med + 1), SyscallSucceeds());
+ int max = Size();
+ EXPECT_GT(max, med); // Ditto.
+}
+
+TEST_P(PipeTest, SizeChangeMax) {
+ SKIP_IF(!CreateBlocking());
+
+ // Assert there's some maximum.
+ EXPECT_THAT(fcntl(rfd_.get(), F_SETPIPE_SZ, 0x7fffffffffffffff),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(fcntl(wfd_.get(), F_SETPIPE_SZ, 0x7fffffffffffffff),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(PipeTest, SizeChangeFull) {
+ SKIP_IF(!CreateBlocking());
+
+ // Ensure that we adjust to a large enough size to avoid rounding when we
+ // perform the size decrease. If rounding occurs, we may not actually
+ // adjust the size and the call below will return success. It was found via
+ // experimentation that this granularity avoids the rounding for Linux.
+ constexpr int kDelta = 64 * 1024;
+ ASSERT_THAT(fcntl(wfd_.get(), F_SETPIPE_SZ, Size() + kDelta),
+ SyscallSucceeds());
+
+ // Fill the buffer and try to change down.
+ std::vector<char> buf(Size());
+ ASSERT_THAT(write(wfd_.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+ EXPECT_THAT(fcntl(wfd_.get(), F_SETPIPE_SZ, Size() - kDelta),
+ SyscallFailsWithErrno(EBUSY));
+}
+
+TEST_P(PipeTest, Streaming) {
+ SKIP_IF(!CreateBlocking());
+
+ // We make too many calls to go through full save cycles.
+ DisableSave ds;
+
+ // Size() requires 2 syscalls, call it once and remember the value.
+ const int pipe_size = Size();
+
+ absl::Notification notify;
+ ScopedThread t([this, &notify, pipe_size]() {
+ // Don't start until it's full.
+ notify.WaitForNotification();
+ for (int i = 0; i < pipe_size; i++) {
+ int rbuf;
+ ASSERT_THAT(read(rfd_.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(sizeof(rbuf)));
+ EXPECT_EQ(rbuf, i);
+ }
+ });
+
+ // Write 4 bytes * pipe_size. It will fill up the pipe once, notify the reader
+ // to start. Then we write pipe size worth 3 more times to ensure the reader
+ // can follow along.
+ ssize_t total = 0;
+ for (int i = 0; i < pipe_size; i++) {
+ ssize_t written = write(wfd_.get(), &i, sizeof(i));
+ ASSERT_THAT(written, SyscallSucceedsWithValue(sizeof(i)));
+ total += written;
+
+ // Is the next write about to fill up the buffer? Wake up the reader once.
+ if (total < pipe_size && (total + written) >= pipe_size) {
+ notify.Notify();
+ }
+ }
+}
+
+std::string PipeCreatorName(::testing::TestParamInfo<PipeCreator> info) {
+ return info.param.name_; // Use the name specified.
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ Pipes, PipeTest,
+ ::testing::Values(
+ PipeCreator{
+ "pipe",
+ [](int fds[2], bool* is_blocking, bool* is_namedpipe) {
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ *is_blocking = true;
+ *is_namedpipe = false;
+ },
+ },
+ PipeCreator{
+ "pipe2blocking",
+ [](int fds[2], bool* is_blocking, bool* is_namedpipe) {
+ ASSERT_THAT(pipe2(fds, 0), SyscallSucceeds());
+ *is_blocking = true;
+ *is_namedpipe = false;
+ },
+ },
+ PipeCreator{
+ "pipe2nonblocking",
+ [](int fds[2], bool* is_blocking, bool* is_namedpipe) {
+ ASSERT_THAT(pipe2(fds, O_NONBLOCK), SyscallSucceeds());
+ *is_blocking = false;
+ *is_namedpipe = false;
+ },
+ },
+ PipeCreator{
+ "smallbuffer",
+ [](int fds[2], bool* is_blocking, bool* is_namedpipe) {
+ // Set to the minimum available size (will round up).
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ ASSERT_THAT(fcntl(fds[0], F_SETPIPE_SZ, 0), SyscallSucceeds());
+ *is_blocking = true;
+ *is_namedpipe = false;
+ },
+ },
+ PipeCreator{
+ "namednonblocking",
+ [](int fds[2], bool* is_blocking, bool* is_namedpipe) {
+ // Create a new file-based pipe (non-blocking).
+ std::string path;
+ {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ path = file.path();
+ }
+ SKIP_IF(mkfifo(path.c_str(), 0644) != 0);
+ fds[0] = open(path.c_str(), O_NONBLOCK | O_RDONLY);
+ fds[1] = open(path.c_str(), O_NONBLOCK | O_WRONLY);
+ MaybeSave();
+ *is_blocking = false;
+ *is_namedpipe = true;
+ },
+ },
+ PipeCreator{
+ "namedblocking",
+ [](int fds[2], bool* is_blocking, bool* is_namedpipe) {
+ // Create a new file-based pipe (blocking).
+ std::string path;
+ {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ path = file.path();
+ }
+ SKIP_IF(mkfifo(path.c_str(), 0644) != 0);
+ ScopedThread t(
+ [&path, &fds]() { fds[1] = open(path.c_str(), O_WRONLY); });
+ fds[0] = open(path.c_str(), O_RDONLY);
+ t.Join();
+ MaybeSave();
+ *is_blocking = true;
+ *is_namedpipe = true;
+ },
+ }),
+ PipeCreatorName);
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc
new file mode 100644
index 000000000..7a316427d
--- /dev/null
+++ b/test/syscalls/linux/poll.cc
@@ -0,0 +1,294 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <poll.h>
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <iostream>
+
+#include "gtest/gtest.h"
+#include "absl/synchronization/notification.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/base_poll_test.h"
+#include "test/util/eventfd_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+class PollTest : public BasePollTest {
+ protected:
+ void SetUp() override { BasePollTest::SetUp(); }
+ void TearDown() override { BasePollTest::TearDown(); }
+};
+
+TEST_F(PollTest, InvalidFds) {
+ // fds is invalid because it's null, but we tell ppoll the length is non-zero.
+ EXPECT_THAT(poll(nullptr, 1, 1), SyscallFailsWithErrno(EFAULT));
+ EXPECT_THAT(poll(nullptr, -1, 1), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(PollTest, NullFds) {
+ EXPECT_THAT(poll(nullptr, 0, 10), SyscallSucceeds());
+}
+
+TEST_F(PollTest, ZeroTimeout) {
+ EXPECT_THAT(poll(nullptr, 0, 0), SyscallSucceeds());
+}
+
+// If random S/R interrupts the poll, SIGALRM may be delivered before poll
+// restarts, causing the poll to hang forever.
+TEST_F(PollTest, NegativeTimeout_NoRandomSave) {
+ // Negative timeout mean wait forever so set a timer.
+ SetTimer(absl::Milliseconds(100));
+ EXPECT_THAT(poll(nullptr, 0, -1), SyscallFailsWithErrno(EINTR));
+ EXPECT_TRUE(TimerFired());
+}
+
+TEST_F(PollTest, NonBlockingEventPOLLIN) {
+ // Create a pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+
+ FileDescriptor fd0(fds[0]);
+ FileDescriptor fd1(fds[1]);
+
+ // Write some data to the pipe.
+ char s[] = "foo\n";
+ ASSERT_THAT(WriteFd(fd1.get(), s, strlen(s) + 1), SyscallSucceeds());
+
+ // Poll on the reader fd with POLLIN event.
+ struct pollfd poll_fd = {fd0.get(), POLLIN, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 0), SyscallSucceedsWithValue(1));
+
+ // Should trigger POLLIN event.
+ EXPECT_EQ(poll_fd.revents & POLLIN, POLLIN);
+}
+
+TEST_F(PollTest, BlockingEventPOLLIN) {
+ // Create a pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+
+ FileDescriptor fd0(fds[0]);
+ FileDescriptor fd1(fds[1]);
+
+ // Start a blocking poll on the read fd.
+ absl::Notification notify;
+ ScopedThread t([&fd0, &notify]() {
+ notify.Notify();
+
+ // Poll on the reader fd with POLLIN event.
+ struct pollfd poll_fd = {fd0.get(), POLLIN, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, -1), SyscallSucceedsWithValue(1));
+
+ // Should trigger POLLIN event.
+ EXPECT_EQ(poll_fd.revents & POLLIN, POLLIN);
+ });
+
+ notify.WaitForNotification();
+ absl::SleepFor(absl::Seconds(1.0));
+
+ // Write some data to the pipe.
+ char s[] = "foo\n";
+ ASSERT_THAT(WriteFd(fd1.get(), s, strlen(s) + 1), SyscallSucceeds());
+}
+
+TEST_F(PollTest, NonBlockingEventPOLLHUP) {
+ // Create a pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+
+ FileDescriptor fd0(fds[0]);
+ FileDescriptor fd1(fds[1]);
+
+ // Close the writer fd.
+ fd1.reset();
+
+ // Poll on the reader fd with POLLIN event.
+ struct pollfd poll_fd = {fd0.get(), POLLIN, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 0), SyscallSucceedsWithValue(1));
+
+ // Should trigger POLLHUP event.
+ EXPECT_EQ(poll_fd.revents & POLLHUP, POLLHUP);
+
+ // Should not trigger POLLIN event.
+ EXPECT_EQ(poll_fd.revents & POLLIN, 0);
+}
+
+TEST_F(PollTest, BlockingEventPOLLHUP) {
+ // Create a pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+
+ FileDescriptor fd0(fds[0]);
+ FileDescriptor fd1(fds[1]);
+
+ // Start a blocking poll on the read fd.
+ absl::Notification notify;
+ ScopedThread t([&fd0, &notify]() {
+ notify.Notify();
+
+ // Poll on the reader fd with POLLIN event.
+ struct pollfd poll_fd = {fd0.get(), POLLIN, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, -1), SyscallSucceedsWithValue(1));
+
+ // Should trigger POLLHUP event.
+ EXPECT_EQ(poll_fd.revents & POLLHUP, POLLHUP);
+
+ // Should not trigger POLLIN event.
+ EXPECT_EQ(poll_fd.revents & POLLIN, 0);
+ });
+
+ notify.WaitForNotification();
+ absl::SleepFor(absl::Seconds(1.0));
+
+ // Write some data and close the writer fd.
+ fd1.reset();
+}
+
+TEST_F(PollTest, NonBlockingEventPOLLERR) {
+ // Create a pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+
+ FileDescriptor fd0(fds[0]);
+ FileDescriptor fd1(fds[1]);
+
+ // Close the reader fd.
+ fd0.reset();
+
+ // Poll on the writer fd with POLLOUT event.
+ struct pollfd poll_fd = {fd1.get(), POLLOUT, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 0), SyscallSucceedsWithValue(1));
+
+ // Should trigger POLLERR event.
+ EXPECT_EQ(poll_fd.revents & POLLERR, POLLERR);
+
+ // Should also trigger POLLOUT event.
+ EXPECT_EQ(poll_fd.revents & POLLOUT, POLLOUT);
+}
+
+// This test will validate that if an FD is already ready on some event, whether
+// it's POLLIN or POLLOUT it will not immediately return unless that's actually
+// what the caller was interested in.
+TEST_F(PollTest, ImmediatelyReturnOnlyOnPollEvents) {
+ // Create a pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+
+ FileDescriptor fd0(fds[0]);
+ FileDescriptor fd1(fds[1]);
+
+ // Wait for read related event on the write side of the pipe, since a write
+ // is possible on fds[1] it would mean that POLLOUT would return immediately.
+ // We should make sure that we're not woken up with that state that we didn't
+ // specificially request.
+ constexpr int kTimeoutMs = 100;
+ struct pollfd poll_fd = {fd1.get(), POLLIN | POLLPRI | POLLRDHUP, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, kTimeoutMs),
+ SyscallSucceedsWithValue(0)); // We should timeout.
+ EXPECT_EQ(poll_fd.revents, 0); // Nothing should be in returned events.
+
+ // Now let's poll on POLLOUT and we should get back 1 fd as being ready and
+ // it should contain POLLOUT in the revents.
+ poll_fd.events = POLLOUT;
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, kTimeoutMs),
+ SyscallSucceedsWithValue(1)); // 1 fd should have an event.
+ EXPECT_EQ(poll_fd.revents, POLLOUT); // POLLOUT should be in revents.
+}
+
+// This test validates that poll(2) while data is available immediately returns.
+TEST_F(PollTest, PollLevelTriggered) {
+ int fds[2] = {};
+ ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, /*protocol=*/0, fds),
+ SyscallSucceeds());
+
+ FileDescriptor fd0(fds[0]);
+ FileDescriptor fd1(fds[1]);
+
+ // Write two bytes to the socket.
+ const char* kBuf = "aa";
+ ASSERT_THAT(RetryEINTR(send)(fd0.get(), kBuf, /*len=*/2, /*flags=*/0),
+ SyscallSucceedsWithValue(2)); // 2 bytes should be written.
+
+ // Poll(2) should immediately return as there is data available to read.
+ constexpr int kInfiniteTimeout = -1;
+ struct pollfd poll_fd = {fd1.get(), POLLIN, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, /*nfds=*/1, kInfiniteTimeout),
+ SyscallSucceedsWithValue(1)); // 1 fd should be ready to read.
+ EXPECT_NE(poll_fd.revents & POLLIN, 0);
+
+ // Read a single byte.
+ char read_byte = 0;
+ ASSERT_THAT(RetryEINTR(recv)(fd1.get(), &read_byte, /*len=*/1, /*flags=*/0),
+ SyscallSucceedsWithValue(1)); // 1 byte should be read.
+ ASSERT_EQ(read_byte, 'a'); // We should have read a single 'a'.
+
+ // Create a separate pollfd for our second poll.
+ struct pollfd poll_fd_after = {fd1.get(), POLLIN, 0};
+
+ // Poll(2) should again immediately return since we only read one byte.
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd_after, /*nfds=*/1, kInfiniteTimeout),
+ SyscallSucceedsWithValue(1)); // 1 fd should be ready to read.
+ EXPECT_NE(poll_fd_after.revents & POLLIN, 0);
+}
+
+TEST_F(PollTest, Nfds) {
+ // Stash value of RLIMIT_NOFILES.
+ struct rlimit rlim;
+ TEST_PCHECK(getrlimit(RLIMIT_NOFILE, &rlim) == 0);
+
+ // gVisor caps the number of FDs that epoll can use beyond RLIMIT_NOFILE.
+ constexpr rlim_t maxFD = 4096;
+ if (rlim.rlim_cur > maxFD) {
+ rlim.rlim_cur = maxFD;
+ TEST_PCHECK(setrlimit(RLIMIT_NOFILE, &rlim) == 0);
+ }
+
+ rlim_t max_fds = rlim.rlim_cur;
+ std::cout << "Using limit: " << max_fds << std::endl;
+
+ // Create an eventfd. Since its value is initially zero, it is writable.
+ FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD());
+
+ // Create the biggest possible pollfd array such that each element is valid.
+ // Each entry in the 'fds' array refers to the eventfd and polls for
+ // "writable" events (events=POLLOUT). This essentially guarantees that the
+ // poll() is a no-op and allows negative testing of the 'nfds' parameter.
+ std::vector<struct pollfd> fds(max_fds + 1,
+ {.fd = efd.get(), .events = POLLOUT});
+
+ // Verify that 'nfds' up to RLIMIT_NOFILE are allowed.
+ EXPECT_THAT(RetryEINTR(poll)(fds.data(), 1, 1), SyscallSucceedsWithValue(1));
+ EXPECT_THAT(RetryEINTR(poll)(fds.data(), max_fds / 2, 1),
+ SyscallSucceedsWithValue(max_fds / 2));
+ EXPECT_THAT(RetryEINTR(poll)(fds.data(), max_fds, 1),
+ SyscallSucceedsWithValue(max_fds));
+
+ // If 'nfds' exceeds RLIMIT_NOFILE then it must fail with EINVAL.
+ EXPECT_THAT(poll(fds.data(), max_fds + 1, 1), SyscallFailsWithErrno(EINVAL));
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/ppoll.cc b/test/syscalls/linux/ppoll.cc
new file mode 100644
index 000000000..8245a11e8
--- /dev/null
+++ b/test/syscalls/linux/ppoll.cc
@@ -0,0 +1,155 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <poll.h>
+#include <signal.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/base_poll_test.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+// Linux and glibc have a different idea of the sizeof sigset_t. When calling
+// the syscall directly, use what the kernel expects.
+unsigned kSigsetSize = SIGRTMAX / 8;
+
+// Linux ppoll(2) differs from the glibc wrapper function in that Linux updates
+// the timeout with the amount of time remaining. In order to test this behavior
+// we need to use the syscall directly.
+int syscallPpoll(struct pollfd* fds, nfds_t nfds, struct timespec* timeout_ts,
+ const sigset_t* sigmask, unsigned mask_size) {
+ return syscall(SYS_ppoll, fds, nfds, timeout_ts, sigmask, mask_size);
+}
+
+class PpollTest : public BasePollTest {
+ protected:
+ void SetUp() override { BasePollTest::SetUp(); }
+ void TearDown() override { BasePollTest::TearDown(); }
+};
+
+TEST_F(PpollTest, InvalidFds) {
+ // fds is invalid because it's null, but we tell ppoll the length is non-zero.
+ struct timespec timeout = {};
+ sigset_t sigmask;
+ TEST_PCHECK(sigemptyset(&sigmask) == 0);
+ EXPECT_THAT(syscallPpoll(nullptr, 1, &timeout, &sigmask, kSigsetSize),
+ SyscallFailsWithErrno(EFAULT));
+ EXPECT_THAT(syscallPpoll(nullptr, -1, &timeout, &sigmask, kSigsetSize),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// See that when fds is null, ppoll behaves like sleep.
+TEST_F(PpollTest, NullFds) {
+ struct timespec timeout = absl::ToTimespec(absl::Milliseconds(10));
+ ASSERT_THAT(syscallPpoll(nullptr, 0, &timeout, nullptr, 0),
+ SyscallSucceeds());
+ EXPECT_EQ(timeout.tv_sec, 0);
+ EXPECT_EQ(timeout.tv_nsec, 0);
+}
+
+TEST_F(PpollTest, ZeroTimeout) {
+ struct timespec timeout = {};
+ ASSERT_THAT(syscallPpoll(nullptr, 0, &timeout, nullptr, 0),
+ SyscallSucceeds());
+ EXPECT_EQ(timeout.tv_sec, 0);
+ EXPECT_EQ(timeout.tv_nsec, 0);
+}
+
+// If random S/R interrupts the ppoll, SIGALRM may be delivered before ppoll
+// restarts, causing the ppoll to hang forever.
+TEST_F(PpollTest, NoTimeout_NoRandomSave) {
+ // When there's no timeout, ppoll may never return so set a timer.
+ SetTimer(absl::Milliseconds(100));
+ // See that we get interrupted by the timer.
+ ASSERT_THAT(syscallPpoll(nullptr, 0, nullptr, nullptr, 0),
+ SyscallFailsWithErrno(EINTR));
+ EXPECT_TRUE(TimerFired());
+}
+
+TEST_F(PpollTest, InvalidTimeoutNegative) {
+ struct timespec timeout = absl::ToTimespec(absl::Nanoseconds(-1));
+ EXPECT_THAT(syscallPpoll(nullptr, 0, &timeout, nullptr, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(PpollTest, InvalidTimeoutNotNormalized) {
+ struct timespec timeout = {0, 1000000001};
+ EXPECT_THAT(syscallPpoll(nullptr, 0, &timeout, nullptr, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(PpollTest, InvalidMaskSize) {
+ struct timespec timeout = {};
+ sigset_t sigmask;
+ TEST_PCHECK(sigemptyset(&sigmask) == 0);
+ EXPECT_THAT(syscallPpoll(nullptr, 0, &timeout, &sigmask, 128),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Verify that signals blocked by the ppoll mask (that would otherwise be
+// allowed) do not interrupt ppoll.
+TEST_F(PpollTest, SignalMaskBlocksSignal) {
+ absl::Duration duration(absl::Seconds(30));
+ struct timespec timeout = absl::ToTimespec(duration);
+ absl::Duration timer_duration(absl::Seconds(10));
+
+ // Call with a mask that blocks SIGALRM. See that ppoll is not interrupted
+ // (i.e. returns 0) and that upon completion, the timer has fired.
+ sigset_t mask;
+ ASSERT_THAT(sigprocmask(0, nullptr, &mask), SyscallSucceeds());
+ TEST_PCHECK(sigaddset(&mask, SIGALRM) == 0);
+ SetTimer(timer_duration);
+ MaybeSave();
+ ASSERT_FALSE(TimerFired());
+ ASSERT_THAT(syscallPpoll(nullptr, 0, &timeout, &mask, kSigsetSize),
+ SyscallSucceeds());
+ EXPECT_TRUE(TimerFired());
+ EXPECT_EQ(absl::DurationFromTimespec(timeout), absl::Duration());
+}
+
+// Verify that signals allowed by the ppoll mask (that would otherwise be
+// blocked) interrupt ppoll.
+TEST_F(PpollTest, SignalMaskAllowsSignal) {
+ absl::Duration duration(absl::Seconds(30));
+ struct timespec timeout = absl::ToTimespec(duration);
+ absl::Duration timer_duration(absl::Seconds(10));
+
+ sigset_t mask;
+ ASSERT_THAT(sigprocmask(0, nullptr, &mask), SyscallSucceeds());
+
+ // Block SIGALRM.
+ auto cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, SIGALRM));
+
+ // Call with a mask that unblocks SIGALRM. See that ppoll is interrupted.
+ SetTimer(timer_duration);
+ MaybeSave();
+ ASSERT_FALSE(TimerFired());
+ ASSERT_THAT(syscallPpoll(nullptr, 0, &timeout, &mask, kSigsetSize),
+ SyscallFailsWithErrno(EINTR));
+ EXPECT_TRUE(TimerFired());
+ EXPECT_GT(absl::DurationFromTimespec(timeout), absl::Duration());
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc
new file mode 100644
index 000000000..04c5161f5
--- /dev/null
+++ b/test/syscalls/linux/prctl.cc
@@ -0,0 +1,230 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/prctl.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "absl/flags/flag.h"
+#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+ABSL_FLAG(bool, prctl_no_new_privs_test_child, false,
+ "If true, exit with the return value of prctl(PR_GET_NO_NEW_PRIVS) "
+ "plus an offset (see test source).");
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+#ifndef SUID_DUMP_DISABLE
+#define SUID_DUMP_DISABLE 0
+#endif /* SUID_DUMP_DISABLE */
+#ifndef SUID_DUMP_USER
+#define SUID_DUMP_USER 1
+#endif /* SUID_DUMP_USER */
+#ifndef SUID_DUMP_ROOT
+#define SUID_DUMP_ROOT 2
+#endif /* SUID_DUMP_ROOT */
+
+TEST(PrctlTest, NameInitialized) {
+ const size_t name_length = 20;
+ char name[name_length] = {};
+ ASSERT_THAT(prctl(PR_GET_NAME, name), SyscallSucceeds());
+ ASSERT_NE(std::string(name), "");
+}
+
+TEST(PrctlTest, SetNameLongName) {
+ const size_t name_length = 20;
+ const std::string long_name(name_length, 'A');
+ ASSERT_THAT(prctl(PR_SET_NAME, long_name.c_str()), SyscallSucceeds());
+ char truncated_name[name_length] = {};
+ ASSERT_THAT(prctl(PR_GET_NAME, truncated_name), SyscallSucceeds());
+ const size_t truncated_length = 15;
+ ASSERT_EQ(long_name.substr(0, truncated_length), std::string(truncated_name));
+}
+
+TEST(PrctlTest, ChildProcessName) {
+ constexpr size_t kMaxNameLength = 15;
+
+ char parent_name[kMaxNameLength + 1] = {};
+ memset(parent_name, 'a', kMaxNameLength);
+
+ ASSERT_THAT(prctl(PR_SET_NAME, parent_name), SyscallSucceeds());
+
+ pid_t child_pid = fork();
+ TEST_PCHECK(child_pid >= 0);
+ if (child_pid == 0) {
+ char child_name[kMaxNameLength + 1] = {};
+ TEST_PCHECK(prctl(PR_GET_NAME, child_name) >= 0);
+ TEST_CHECK(memcmp(parent_name, child_name, sizeof(parent_name)) == 0);
+ _exit(0);
+ }
+
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status =" << status;
+}
+
+// Offset added to exit code from test child to distinguish from other abnormal
+// exits.
+constexpr int kPrctlNoNewPrivsTestChildExitBase = 100;
+
+TEST(PrctlTest, NoNewPrivsPreservedAcrossCloneForkAndExecve) {
+ // Check if no_new_privs is already set. If it is, we can still test that it's
+ // preserved across clone/fork/execve, but we also expect it to still be set
+ // at the end of the test. Otherwise, call prctl(PR_SET_NO_NEW_PRIVS) so as
+ // not to contaminate the original thread.
+ int no_new_privs;
+ ASSERT_THAT(no_new_privs = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0),
+ SyscallSucceeds());
+ ScopedThread([] {
+ ASSERT_THAT(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), SyscallSucceeds());
+ EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0),
+ SyscallSucceedsWithValue(1));
+ ScopedThread([] {
+ EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0),
+ SyscallSucceedsWithValue(1));
+ // Note that these ASSERT_*s failing will only return from this thread,
+ // but this is the intended behavior.
+ pid_t child_pid = -1;
+ int execve_errno = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec("/proc/self/exe",
+ {"/proc/self/exe", "--prctl_no_new_privs_test_child"}, {},
+ nullptr, &child_pid, &execve_errno));
+
+ ASSERT_GT(child_pid, 0);
+ ASSERT_EQ(execve_errno, 0);
+
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0),
+ SyscallSucceeds());
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), kPrctlNoNewPrivsTestChildExitBase + 1);
+
+ EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0),
+ SyscallSucceedsWithValue(1));
+ });
+ EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0),
+ SyscallSucceedsWithValue(1));
+ });
+ EXPECT_THAT(prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0),
+ SyscallSucceedsWithValue(no_new_privs));
+}
+
+TEST(PrctlTest, PDeathSig) {
+ pid_t child_pid;
+
+ // Make the new process' parent a separate thread since the parent death
+ // signal fires when the parent *thread* exits.
+ ScopedThread([&] {
+ child_pid = fork();
+ TEST_CHECK(child_pid >= 0);
+ if (child_pid == 0) {
+ // In child process.
+ TEST_CHECK(prctl(PR_SET_PDEATHSIG, SIGKILL) >= 0);
+ int signo;
+ TEST_CHECK(prctl(PR_GET_PDEATHSIG, &signo) >= 0);
+ TEST_CHECK(signo == SIGKILL);
+ // Enable tracing, then raise SIGSTOP and expect our parent to suppress
+ // it.
+ TEST_CHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) >= 0);
+ raise(SIGSTOP);
+ // Sleep until killed by our parent death signal. sleep(3) is
+ // async-signal-safe, absl::SleepFor isn't.
+ while (true) {
+ sleep(10);
+ }
+ }
+ // In parent process.
+
+ // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop.
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << "status = " << status;
+
+ // Suppress the SIGSTOP and detach from the child.
+ ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds());
+ });
+
+ // The child should have been killed by its parent death SIGKILL.
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL)
+ << "status = " << status;
+}
+
+// This test is to validate that calling prctl with PR_SET_MM without the
+// CAP_SYS_RESOURCE returns EPERM.
+TEST(PrctlTest, InvalidPrSetMM) {
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_RESOURCE))) {
+ ASSERT_NO_ERRNO(SetCapability(CAP_SYS_RESOURCE,
+ false)); // Drop capability to test below.
+ }
+ ASSERT_THAT(prctl(PR_SET_MM, 0, 0, 0, 0), SyscallFailsWithErrno(EPERM));
+}
+
+// Sanity check that dumpability is remembered.
+TEST(PrctlTest, SetGetDumpability) {
+ int before;
+ ASSERT_THAT(before = prctl(PR_GET_DUMPABLE), SyscallSucceeds());
+ auto cleanup = Cleanup([before] {
+ ASSERT_THAT(prctl(PR_SET_DUMPABLE, before), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_DISABLE), SyscallSucceeds());
+ EXPECT_THAT(prctl(PR_GET_DUMPABLE),
+ SyscallSucceedsWithValue(SUID_DUMP_DISABLE));
+
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_USER), SyscallSucceeds());
+ EXPECT_THAT(prctl(PR_GET_DUMPABLE), SyscallSucceedsWithValue(SUID_DUMP_USER));
+}
+
+// SUID_DUMP_ROOT cannot be set via PR_SET_DUMPABLE.
+TEST(PrctlTest, RootDumpability) {
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_ROOT),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ gvisor::testing::TestInit(&argc, &argv);
+
+ if (absl::GetFlag(FLAGS_prctl_no_new_privs_test_child)) {
+ exit(gvisor::testing::kPrctlNoNewPrivsTestChildExitBase +
+ prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
+ }
+
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/prctl_setuid.cc b/test/syscalls/linux/prctl_setuid.cc
new file mode 100644
index 000000000..c4e9cf528
--- /dev/null
+++ b/test/syscalls/linux/prctl_setuid.cc
@@ -0,0 +1,268 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sched.h>
+#include <sys/prctl.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "absl/flags/flag.h"
+#include "test/util/capability_util.h"
+#include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+ABSL_FLAG(int32_t, scratch_uid, 65534, "scratch UID");
+// This flag is used to verify that after an exec PR_GET_KEEPCAPS
+// returns 0, the return code will be offset by kPrGetKeepCapsExitBase.
+ABSL_FLAG(bool, prctl_pr_get_keepcaps, false,
+ "If true the test will verify that prctl with pr_get_keepcaps"
+ "returns 0. The test will exit with the result of that check.");
+
+// These tests exist seperately from prctl because we need to start
+// them as root. Setuid() has the behavior that permissions are fully
+// removed if one of the UIDs were 0 before a setuid() call. This
+// behavior can be changed by using PR_SET_KEEPCAPS and that is what
+// is tested here.
+//
+// Reference setuid(2):
+// The setuid() function checks the effective user ID of
+// the caller and if it is the superuser, all process-related user ID's
+// are set to uid. After this has occurred, it is impossible for the
+// program to regain root privileges.
+//
+// Thus, a set-user-ID-root program wishing to temporarily drop root
+// privileges, assume the identity of an unprivileged user, and then
+// regain root privileges afterward cannot use setuid(). You can
+// accomplish this with seteuid(2).
+namespace gvisor {
+namespace testing {
+
+// Offset added to exit code from test child to distinguish from other abnormal
+// exits.
+constexpr int kPrGetKeepCapsExitBase = 100;
+
+namespace {
+
+class PrctlKeepCapsSetuidTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ // PR_GET_KEEPCAPS will only return 0 or 1 (on success).
+ ASSERT_THAT(original_keepcaps_ = prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0),
+ SyscallSucceeds());
+ ASSERT_TRUE(original_keepcaps_ == 0 || original_keepcaps_ == 1);
+ }
+
+ void TearDown() override {
+ // Restore PR_SET_KEEPCAPS.
+ ASSERT_THAT(prctl(PR_SET_KEEPCAPS, original_keepcaps_, 0, 0, 0),
+ SyscallSucceeds());
+
+ // Verify that it was restored.
+ ASSERT_THAT(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0),
+ SyscallSucceedsWithValue(original_keepcaps_));
+ }
+
+ // The original keep caps value exposed so tests can use it if they need.
+ int original_keepcaps_ = 0;
+};
+
+// This test will verify that a bad value, eg. not 0 or 1 for
+// PR_SET_KEEPCAPS will return EINVAL as required by prctl(2).
+TEST_F(PrctlKeepCapsSetuidTest, PrctlBadArgsToKeepCaps) {
+ ASSERT_THAT(prctl(PR_SET_KEEPCAPS, 2, 0, 0, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// This test will verify that a setuid(2) without PR_SET_KEEPCAPS will cause
+// all capabilities to be dropped.
+TEST_F(PrctlKeepCapsSetuidTest, SetUidNoKeepCaps) {
+ // getuid(2) never fails.
+ if (getuid() != 0) {
+ SKIP_IF(!IsRunningOnGvisor());
+ FAIL() << "User is not root on gvisor platform.";
+ }
+
+ // Do setuid in a separate thread so that after finishing this test, the
+ // process can still open files the test harness created before starting
+ // this test. Otherwise, the files are created by root (UID before the
+ // test), but cannot be opened by the `uid` set below after the test. After
+ // calling setuid(non-zero-UID), there is no way to get root privileges
+ // back.
+ ScopedThread([] {
+ // Start by verifying we have a capability.
+ TEST_CHECK(HaveCapability(CAP_SYS_ADMIN).ValueOrDie());
+
+ // Verify that PR_GET_KEEPCAPS is disabled.
+ ASSERT_THAT(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0),
+ SyscallSucceedsWithValue(0));
+
+ // Use syscall instead of glibc setuid wrapper because we want this setuid
+ // call to only apply to this task. POSIX threads, however, require that
+ // all threads have the same UIDs, so using the setuid wrapper sets all
+ // threads' real UID.
+ EXPECT_THAT(syscall(SYS_setuid, absl::GetFlag(FLAGS_scratch_uid)),
+ SyscallSucceeds());
+
+ // Verify that we changed uid.
+ EXPECT_THAT(getuid(),
+ SyscallSucceedsWithValue(absl::GetFlag(FLAGS_scratch_uid)));
+
+ // Verify we lost the capability in the effective set, this always happens.
+ TEST_CHECK(!HaveCapability(CAP_SYS_ADMIN).ValueOrDie());
+
+ // We should have also lost it in the permitted set by the setuid() so
+ // SetCapability should fail when we try to add it back to the effective set
+ ASSERT_FALSE(SetCapability(CAP_SYS_ADMIN, true).ok());
+ });
+}
+
+// This test will verify that a setuid with PR_SET_KEEPCAPS will cause
+// capabilities to be retained after we switch away from the root user.
+TEST_F(PrctlKeepCapsSetuidTest, SetUidKeepCaps) {
+ // getuid(2) never fails.
+ if (getuid() != 0) {
+ SKIP_IF(!IsRunningOnGvisor());
+ FAIL() << "User is not root on gvisor platform.";
+ }
+
+ // Do setuid in a separate thread so that after finishing this test, the
+ // process can still open files the test harness created before starting
+ // this test. Otherwise, the files are created by root (UID before the
+ // test), but cannot be opened by the `uid` set below after the test. After
+ // calling setuid(non-zero-UID), there is no way to get root privileges
+ // back.
+ ScopedThread([] {
+ // Start by verifying we have a capability.
+ TEST_CHECK(HaveCapability(CAP_SYS_ADMIN).ValueOrDie());
+
+ // Set PR_SET_KEEPCAPS.
+ ASSERT_THAT(prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0), SyscallSucceeds());
+
+ // Verify PR_SET_KEEPCAPS was set before we proceed.
+ ASSERT_THAT(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0),
+ SyscallSucceedsWithValue(1));
+
+ // Use syscall instead of glibc setuid wrapper because we want this setuid
+ // call to only apply to this task. POSIX threads, however, require that
+ // all threads have the same UIDs, so using the setuid wrapper sets all
+ // threads' real UID.
+ EXPECT_THAT(syscall(SYS_setuid, absl::GetFlag(FLAGS_scratch_uid)),
+ SyscallSucceeds());
+
+ // Verify that we changed uid.
+ EXPECT_THAT(getuid(),
+ SyscallSucceedsWithValue(absl::GetFlag(FLAGS_scratch_uid)));
+
+ // Verify we lost the capability in the effective set, this always happens.
+ TEST_CHECK(!HaveCapability(CAP_SYS_ADMIN).ValueOrDie());
+
+ // We lost the capability in the effective set, but it will still
+ // exist in the permitted set so we can elevate the capability.
+ ASSERT_NO_ERRNO(SetCapability(CAP_SYS_ADMIN, true));
+
+ // Verify we got back the capability in the effective set.
+ TEST_CHECK(HaveCapability(CAP_SYS_ADMIN).ValueOrDie());
+ });
+}
+
+// This test will verify that PR_SET_KEEPCAPS is not retained
+// across an execve. According to prctl(2):
+// "The "keep capabilities" value will be reset to 0 on subsequent
+// calls to execve(2)."
+TEST_F(PrctlKeepCapsSetuidTest, NoKeepCapsAfterExec) {
+ ASSERT_THAT(prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0), SyscallSucceeds());
+
+ // Verify PR_SET_KEEPCAPS was set before we proceed.
+ ASSERT_THAT(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0), SyscallSucceedsWithValue(1));
+
+ pid_t child_pid = -1;
+ int execve_errno = 0;
+ // Do an exec and then verify that PR_GET_KEEPCAPS returns 0
+ // see the body of main below.
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec(
+ "/proc/self/exe", {"/proc/self/exe", "--prctl_pr_get_keepcaps"}, {},
+ nullptr, &child_pid, &execve_errno));
+
+ ASSERT_GT(child_pid, 0);
+ ASSERT_EQ(execve_errno, 0);
+
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ ASSERT_TRUE(WIFEXITED(status));
+ // PR_SET_KEEPCAPS should have been cleared by the exec.
+ // Success should return gvisor::testing::kPrGetKeepCapsExitBase + 0
+ ASSERT_EQ(WEXITSTATUS(status), kPrGetKeepCapsExitBase);
+}
+
+TEST_F(PrctlKeepCapsSetuidTest, NoKeepCapsAfterNewUserNamespace) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
+
+ // Fork to avoid changing the user namespace of the original test process.
+ pid_t const child_pid = fork();
+
+ if (child_pid == 0) {
+ // Verify that the keepcaps flag is set to 0 when we change user namespaces.
+ TEST_PCHECK(prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) == 0);
+ MaybeSave();
+
+ TEST_PCHECK(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0) == 1);
+ MaybeSave();
+
+ TEST_PCHECK(unshare(CLONE_NEWUSER) == 0);
+ MaybeSave();
+
+ TEST_PCHECK(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0) == 0);
+ MaybeSave();
+
+ _exit(0);
+ }
+
+ int status;
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status = " << status;
+}
+
+// This test will verify that PR_SET_KEEPCAPS and PR_GET_KEEPCAPS work correctly
+TEST_F(PrctlKeepCapsSetuidTest, PrGetKeepCaps) {
+ // Set PR_SET_KEEPCAPS to the negation of the original.
+ ASSERT_THAT(prctl(PR_SET_KEEPCAPS, !original_keepcaps_, 0, 0, 0),
+ SyscallSucceeds());
+
+ // Verify it was set.
+ ASSERT_THAT(prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0),
+ SyscallSucceedsWithValue(!original_keepcaps_));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ gvisor::testing::TestInit(&argc, &argv);
+
+ if (absl::GetFlag(FLAGS_prctl_pr_get_keepcaps)) {
+ return gvisor::testing::kPrGetKeepCapsExitBase +
+ prctl(PR_GET_KEEPCAPS, 0, 0, 0, 0);
+ }
+
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/pread64.cc b/test/syscalls/linux/pread64.cc
new file mode 100644
index 000000000..bcdbbb044
--- /dev/null
+++ b/test/syscalls/linux/pread64.cc
@@ -0,0 +1,167 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/unistd.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class Pread64Test : public ::testing::Test {
+ void SetUp() override {
+ name_ = NewTempAbsPath();
+ ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_CREAT, 0644));
+ }
+
+ void TearDown() override { unlink(name_.c_str()); }
+
+ public:
+ std::string name_;
+};
+
+TEST(Pread64TestNoTempFile, BadFileDescriptor) {
+ char buf[1024];
+ EXPECT_THAT(pread64(-1, buf, 1024, 0), SyscallFailsWithErrno(EBADF));
+}
+
+TEST_F(Pread64Test, ZeroBuffer) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_RDWR));
+
+ char msg[] = "hello world";
+ EXPECT_THAT(pwrite64(fd.get(), msg, strlen(msg), 0),
+ SyscallSucceedsWithValue(strlen(msg)));
+
+ char buf[10];
+ EXPECT_THAT(pread64(fd.get(), buf, 0, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(Pread64Test, BadBuffer) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_RDWR));
+
+ char msg[] = "hello world";
+ EXPECT_THAT(pwrite64(fd.get(), msg, strlen(msg), 0),
+ SyscallSucceedsWithValue(strlen(msg)));
+
+ char* bad_buffer = nullptr;
+ EXPECT_THAT(pread64(fd.get(), bad_buffer, 1024, 0),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(Pread64Test, WriteOnlyNotReadable) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_WRONLY));
+
+ char buf[1024];
+ EXPECT_THAT(pread64(fd.get(), buf, 1024, 0), SyscallFailsWithErrno(EBADF));
+}
+
+TEST_F(Pread64Test, DirNotReadable) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(GetAbsoluteTestTmpdir(), O_RDONLY));
+
+ char buf[1024];
+ EXPECT_THAT(pread64(fd.get(), buf, 1024, 0), SyscallFailsWithErrno(EISDIR));
+}
+
+TEST_F(Pread64Test, BadOffset) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_RDONLY));
+
+ char buf[1024];
+ EXPECT_THAT(pread64(fd.get(), buf, 1024, -1), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(Pread64Test, OffsetNotIncremented) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_RDWR));
+
+ char msg[] = "hello world";
+ EXPECT_THAT(write(fd.get(), msg, strlen(msg)),
+ SyscallSucceedsWithValue(strlen(msg)));
+ int offset;
+ EXPECT_THAT(offset = lseek(fd.get(), 0, SEEK_CUR), SyscallSucceeds());
+
+ char buf1[1024];
+ EXPECT_THAT(pread64(fd.get(), buf1, 1024, 0),
+ SyscallSucceedsWithValue(strlen(msg)));
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(offset));
+
+ char buf2[1024];
+ EXPECT_THAT(pread64(fd.get(), buf2, 1024, 3),
+ SyscallSucceedsWithValue(strlen(msg) - 3));
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(offset));
+}
+
+TEST_F(Pread64Test, EndOfFile) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(name_, O_RDONLY));
+
+ char buf[1024];
+ EXPECT_THAT(pread64(fd.get(), buf, 1024, 0), SyscallSucceedsWithValue(0));
+}
+
+int memfd_create(const std::string& name, unsigned int flags) {
+ return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST_F(Pread64Test, Overflow) {
+ int f = memfd_create("negative", 0);
+ const FileDescriptor fd(f);
+
+ EXPECT_THAT(ftruncate(fd.get(), 0x7fffffffffffffffull), SyscallSucceeds());
+
+ char buf[10];
+ EXPECT_THAT(pread64(fd.get(), buf, sizeof(buf), 0x7fffffffffffffffull),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(Pread64TestNoTempFile, CantReadSocketPair_NoRandomSave) {
+ int sock_fds[2];
+ EXPECT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds), SyscallSucceeds());
+
+ char buf[1024];
+ EXPECT_THAT(pread64(sock_fds[0], buf, 1024, 0),
+ SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(pread64(sock_fds[1], buf, 1024, 0),
+ SyscallFailsWithErrno(ESPIPE));
+
+ EXPECT_THAT(close(sock_fds[0]), SyscallSucceeds());
+ EXPECT_THAT(close(sock_fds[1]), SyscallSucceeds());
+}
+
+TEST(Pread64TestNoTempFile, CantReadPipe) {
+ char buf[1024];
+
+ int pipe_fds[2];
+ EXPECT_THAT(pipe(pipe_fds), SyscallSucceeds());
+
+ EXPECT_THAT(pread64(pipe_fds[0], buf, 1024, 0),
+ SyscallFailsWithErrno(ESPIPE));
+
+ EXPECT_THAT(close(pipe_fds[0]), SyscallSucceeds());
+ EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/preadv.cc b/test/syscalls/linux/preadv.cc
new file mode 100644
index 000000000..5b0743fe9
--- /dev/null
+++ b/test/syscalls/linux/preadv.cc
@@ -0,0 +1,95 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <atomic>
+#include <string>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/memory_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/timer_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Stress copy-on-write. Attempts to reproduce b/38430174.
+TEST(PreadvTest, MMConcurrencyStress) {
+ // Fill a one-page file with zeroes (the contents don't really matter).
+ const auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ /* parent = */ GetAbsoluteTestTmpdir(),
+ /* content = */ std::string(kPageSize, 0), TempPath::kDefaultFileMode));
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY));
+
+ // Get a one-page private mapping to read to.
+ const Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+
+ // Repeatedly fork in a separate thread to force the mapping to become
+ // copy-on-write.
+ std::atomic<bool> done(false);
+ const ScopedThread t([&] {
+ while (!done.load()) {
+ const pid_t pid = fork();
+ TEST_CHECK(pid >= 0);
+ if (pid == 0) {
+ // In child. The parent was obviously multithreaded, so it's neither
+ // safe nor necessary to do much more than exit.
+ syscall(SYS_exit_group, 0);
+ }
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(pid, &status, 0),
+ SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status = " << status;
+ }
+ });
+
+ // Repeatedly read to the mapping.
+ struct iovec iov[2];
+ iov[0].iov_base = m.ptr();
+ iov[0].iov_len = kPageSize / 2;
+ iov[1].iov_base = reinterpret_cast<void*>(m.addr() + kPageSize / 2);
+ iov[1].iov_len = kPageSize / 2;
+ constexpr absl::Duration kTestDuration = absl::Seconds(5);
+ const absl::Time end = absl::Now() + kTestDuration;
+ while (absl::Now() < end) {
+ // Among other causes, save/restore cycles may cause interruptions resulting
+ // in partial reads, so we don't expect any particular return value.
+ EXPECT_THAT(RetryEINTR(preadv)(fd.get(), iov, 2, 0), SyscallSucceeds());
+ }
+
+ // Stop the other thread.
+ done.store(true);
+
+ // The test passes if it neither deadlocks nor crashes the OS.
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/preadv2.cc b/test/syscalls/linux/preadv2.cc
new file mode 100644
index 000000000..4a9acd7ae
--- /dev/null
+++ b/test/syscalls/linux/preadv2.cc
@@ -0,0 +1,280 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/memory/memory.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+#ifndef SYS_preadv2
+#if defined(__x86_64__)
+#define SYS_preadv2 327
+#elif defined(__aarch64__)
+#define SYS_preadv2 286
+#else
+#error "Unknown architecture"
+#endif
+#endif // SYS_preadv2
+
+#ifndef RWF_HIPRI
+#define RWF_HIPRI 0x1
+#endif // RWF_HIPRI
+
+constexpr int kBufSize = 1024;
+
+std::string SetContent() {
+ std::string content;
+ for (int i = 0; i < kBufSize; i++) {
+ content += static_cast<char>((i % 10) + '0');
+ }
+ return content;
+}
+
+ssize_t preadv2(unsigned long fd, const struct iovec* iov, unsigned long iovcnt,
+ off_t offset, unsigned long flags) {
+ // syscall on preadv2 does some weird things (see man syscall and search
+ // preadv2), so we insert a 0 to word align the flags argument on native.
+ return syscall(SYS_preadv2, fd, iov, iovcnt, offset, 0, flags);
+}
+
+// This test is the base case where we call preadv (no offset, no flags).
+TEST(Preadv2Test, TestBaseCall) {
+ SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ std::string content = SetContent();
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), content, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+ std::vector<char> buf(kBufSize);
+ struct iovec iov[2];
+ iov[0].iov_base = buf.data();
+ iov[0].iov_len = buf.size() / 2;
+ iov[1].iov_base = static_cast<char*>(iov[0].iov_base) + (content.size() / 2);
+ iov[1].iov_len = content.size() / 2;
+
+ EXPECT_THAT(preadv2(fd.get(), iov, /*iovcnt*/ 2, /*offset=*/0, /*flags=*/0),
+ SyscallSucceedsWithValue(kBufSize));
+
+ EXPECT_EQ(content, std::string(buf.data(), buf.size()));
+}
+
+// This test is where we call preadv with an offset and no flags.
+TEST(Preadv2Test, TestValidPositiveOffset) {
+ SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ std::string content = SetContent();
+ const std::string prefix = "0";
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), prefix + content, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+ std::vector<char> buf(kBufSize, '0');
+ struct iovec iov;
+ iov.iov_base = buf.data();
+ iov.iov_len = buf.size();
+
+ EXPECT_THAT(preadv2(fd.get(), &iov, /*iovcnt=*/1, /*offset=*/prefix.size(),
+ /*flags=*/0),
+ SyscallSucceedsWithValue(kBufSize));
+
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+
+ EXPECT_EQ(content, std::string(buf.data(), buf.size()));
+}
+
+// This test is the base case where we call readv by using -1 as the offset. The
+// read should use the file offset, so the test increments it by one prior to
+// calling preadv2.
+TEST(Preadv2Test, TestNegativeOneOffset) {
+ SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ std::string content = SetContent();
+ const std::string prefix = "231";
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), prefix + content, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+ ASSERT_THAT(lseek(fd.get(), prefix.size(), SEEK_SET),
+ SyscallSucceedsWithValue(prefix.size()));
+
+ std::vector<char> buf(kBufSize, '0');
+ struct iovec iov;
+ iov.iov_base = buf.data();
+ iov.iov_len = buf.size();
+
+ EXPECT_THAT(preadv2(fd.get(), &iov, /*iovcnt=*/1, /*offset=*/-1, /*flags=*/0),
+ SyscallSucceedsWithValue(kBufSize));
+
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR),
+ SyscallSucceedsWithValue(prefix.size() + buf.size()));
+
+ EXPECT_EQ(content, std::string(buf.data(), buf.size()));
+}
+
+// preadv2 requires if the RWF_HIPRI flag is passed, the fd must be opened with
+// O_DIRECT. This test implements a correct call with the RWF_HIPRI flag.
+TEST(Preadv2Test, TestCallWithRWF_HIPRI) {
+ SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ std::string content = SetContent();
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), content, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+ EXPECT_THAT(fsync(fd.get()), SyscallSucceeds());
+
+ std::vector<char> buf(kBufSize, '0');
+ struct iovec iov;
+ iov.iov_base = buf.data();
+ iov.iov_len = buf.size();
+
+ EXPECT_THAT(
+ preadv2(fd.get(), &iov, /*iovcnt=*/1, /*offset=*/0, /*flags=*/RWF_HIPRI),
+ SyscallSucceedsWithValue(kBufSize));
+
+ EXPECT_THAT(lseek(fd.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+
+ EXPECT_EQ(content, std::string(buf.data(), buf.size()));
+}
+// This test calls preadv2 with an invalid flag.
+TEST(Preadv2Test, TestInvalidFlag) {
+ SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY | O_DIRECT));
+
+ std::vector<char> buf(kBufSize, '0');
+ struct iovec iov;
+ iov.iov_base = buf.data();
+ iov.iov_len = buf.size();
+
+ EXPECT_THAT(preadv2(fd.get(), &iov, /*iovcnt=*/1,
+ /*offset=*/0, /*flags=*/0xF0),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+// This test calls preadv2 with an invalid offset.
+TEST(Preadv2Test, TestInvalidOffset) {
+ SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY | O_DIRECT));
+
+ auto iov = absl::make_unique<struct iovec[]>(1);
+ iov[0].iov_base = nullptr;
+ iov[0].iov_len = 0;
+
+ EXPECT_THAT(preadv2(fd.get(), iov.get(), /*iovcnt=*/1, /*offset=*/-8,
+ /*flags=*/0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// This test calls preadv with a file set O_WRONLY.
+TEST(Preadv2Test, TestUnreadableFile) {
+ SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY));
+
+ auto iov = absl::make_unique<struct iovec[]>(1);
+ iov[0].iov_base = nullptr;
+ iov[0].iov_len = 0;
+
+ EXPECT_THAT(preadv2(fd.get(), iov.get(), /*iovcnt=*/1,
+ /*offset=*/0, /*flags=*/0),
+ SyscallFailsWithErrno(EBADF));
+}
+
+// Calling preadv2 with a non-negative offset calls preadv. Calling preadv with
+// an unseekable file is not allowed. A pipe is used for an unseekable file.
+TEST(Preadv2Test, TestUnseekableFileInvalid) {
+ SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ int pipe_fds[2];
+
+ ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds());
+
+ auto iov = absl::make_unique<struct iovec[]>(1);
+ iov[0].iov_base = nullptr;
+ iov[0].iov_len = 0;
+
+ EXPECT_THAT(preadv2(pipe_fds[0], iov.get(), /*iovcnt=*/1,
+ /*offset=*/2, /*flags=*/0),
+ SyscallFailsWithErrno(ESPIPE));
+
+ EXPECT_THAT(close(pipe_fds[0]), SyscallSucceeds());
+ EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds());
+}
+
+TEST(Preadv2Test, TestUnseekableFileValid) {
+ SKIP_IF(preadv2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ int pipe_fds[2];
+
+ ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds());
+
+ std::vector<char> content(32, 'X');
+
+ EXPECT_THAT(write(pipe_fds[1], content.data(), content.size()),
+ SyscallSucceedsWithValue(content.size()));
+
+ std::vector<char> buf(content.size());
+ auto iov = absl::make_unique<struct iovec[]>(1);
+ iov[0].iov_base = buf.data();
+ iov[0].iov_len = buf.size();
+
+ EXPECT_THAT(preadv2(pipe_fds[0], iov.get(), /*iovcnt=*/1,
+ /*offset=*/static_cast<off_t>(-1), /*flags=*/0),
+ SyscallSucceedsWithValue(buf.size()));
+
+ EXPECT_EQ(content, buf);
+
+ EXPECT_THAT(close(pipe_fds[0]), SyscallSucceeds());
+ EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/priority.cc b/test/syscalls/linux/priority.cc
new file mode 100644
index 000000000..1d9bdfa70
--- /dev/null
+++ b/test/syscalls/linux/priority.cc
@@ -0,0 +1,216 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
+#include "test/util/capability_util.h"
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// These tests are for both the getpriority(2) and setpriority(2) syscalls
+// These tests are very rudimentary because getpriority and setpriority
+// have not yet been fully implemented.
+
+// Getpriority does something
+TEST(GetpriorityTest, Implemented) {
+ // "getpriority() can legitimately return the value -1, it is necessary to
+ // clear the external variable errno prior to the call"
+ errno = 0;
+ EXPECT_THAT(getpriority(PRIO_PROCESS, /*who=*/0), SyscallSucceeds());
+}
+
+// Invalid which
+TEST(GetpriorityTest, InvalidWhich) {
+ errno = 0;
+ EXPECT_THAT(getpriority(/*which=*/3, /*who=*/0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Process is found when which=PRIO_PROCESS
+TEST(GetpriorityTest, ValidWho) {
+ errno = 0;
+ EXPECT_THAT(getpriority(PRIO_PROCESS, getpid()), SyscallSucceeds());
+}
+
+// Process is not found when which=PRIO_PROCESS
+TEST(GetpriorityTest, InvalidWho) {
+ errno = 0;
+ // Flaky, but it's tough to avoid a race condition when finding an unused pid
+ EXPECT_THAT(getpriority(PRIO_PROCESS, /*who=*/INT_MAX - 1),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+// Setpriority does something
+TEST(SetpriorityTest, Implemented) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE)));
+
+ // No need to clear errno for setpriority():
+ // "The setpriority() call returns 0 if there is no error, or -1 if there is"
+ EXPECT_THAT(setpriority(PRIO_PROCESS, /*who=*/0, /*nice=*/16),
+ SyscallSucceeds());
+}
+
+// Invalid which
+TEST(Setpriority, InvalidWhich) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE)));
+
+ EXPECT_THAT(setpriority(/*which=*/3, /*who=*/0, /*nice=*/16),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Process is found when which=PRIO_PROCESS
+TEST(SetpriorityTest, ValidWho) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE)));
+
+ EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), /*nice=*/16),
+ SyscallSucceeds());
+}
+
+// niceval is within the range [-20, 19]
+TEST(SetpriorityTest, InsideRange) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE)));
+
+ // Set 0 < niceval < 19
+ int nice = 12;
+ EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), nice), SyscallSucceeds());
+
+ errno = 0;
+ EXPECT_THAT(getpriority(PRIO_PROCESS, getpid()),
+ SyscallSucceedsWithValue(nice));
+
+ // Set -20 < niceval < 0
+ nice = -12;
+ EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), nice), SyscallSucceeds());
+
+ errno = 0;
+ EXPECT_THAT(getpriority(PRIO_PROCESS, getpid()),
+ SyscallSucceedsWithValue(nice));
+}
+
+// Verify that priority/niceness are exposed via /proc/PID/stat.
+TEST(SetpriorityTest, NicenessExposedViaProcfs) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE)));
+
+ constexpr int kNiceVal = 12;
+ ASSERT_THAT(setpriority(PRIO_PROCESS, getpid(), kNiceVal), SyscallSucceeds());
+
+ errno = 0;
+ ASSERT_THAT(getpriority(PRIO_PROCESS, getpid()),
+ SyscallSucceedsWithValue(kNiceVal));
+
+ // Now verify we can read that same value via /proc/self/stat.
+ std::string proc_stat;
+ ASSERT_NO_ERRNO(GetContents("/proc/self/stat", &proc_stat));
+ std::vector<std::string> pieces = absl::StrSplit(proc_stat, ' ');
+ ASSERT_GT(pieces.size(), 20);
+
+ int niceness_procfs = 0;
+ ASSERT_TRUE(absl::SimpleAtoi(pieces[18], &niceness_procfs));
+ EXPECT_EQ(niceness_procfs, kNiceVal);
+}
+
+// In the kernel's implementation, values outside the range of [-20, 19] are
+// truncated to these minimum and maximum values. See
+// https://elixir.bootlin.com/linux/v4.4/source/kernel/sys.c#L190
+TEST(SetpriorityTest, OutsideRange) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE)));
+
+ // Set niceval > 19
+ EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), /*nice=*/100),
+ SyscallSucceeds());
+
+ errno = 0;
+ // Test niceval truncated to 19
+ EXPECT_THAT(getpriority(PRIO_PROCESS, getpid()),
+ SyscallSucceedsWithValue(/*maxnice=*/19));
+
+ // Set niceval < -20
+ EXPECT_THAT(setpriority(PRIO_PROCESS, getpid(), /*nice=*/-100),
+ SyscallSucceeds());
+
+ errno = 0;
+ // Test niceval truncated to -20
+ EXPECT_THAT(getpriority(PRIO_PROCESS, getpid()),
+ SyscallSucceedsWithValue(/*minnice=*/-20));
+}
+
+// Process is not found when which=PRIO_PROCESS
+TEST(SetpriorityTest, InvalidWho) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE)));
+
+ // Flaky, but it's tough to avoid a race condition when finding an unused pid
+ EXPECT_THAT(setpriority(PRIO_PROCESS,
+ /*who=*/INT_MAX - 1,
+ /*nice=*/16),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+// Nice succeeds, correctly modifies (or in this case does not
+// modify priority of process
+TEST(SetpriorityTest, NiceSucceeds) {
+ errno = 0;
+ const int priority_before = getpriority(PRIO_PROCESS, /*who=*/0);
+ ASSERT_THAT(nice(/*inc=*/0), SyscallSucceeds());
+
+ // nice(0) should not change priority
+ EXPECT_EQ(priority_before, getpriority(PRIO_PROCESS, /*who=*/0));
+}
+
+// Threads resulting from clone() maintain parent's priority
+// Changes to child priority do not affect parent's priority
+TEST(GetpriorityTest, CloneMaintainsPriority) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE)));
+
+ constexpr int kParentPriority = 16;
+ constexpr int kChildPriority = 14;
+ ASSERT_THAT(setpriority(PRIO_PROCESS, getpid(), kParentPriority),
+ SyscallSucceeds());
+
+ ScopedThread th([]() {
+ // Check that priority equals that of parent thread
+ pid_t my_tid;
+ EXPECT_THAT(my_tid = syscall(__NR_gettid), SyscallSucceeds());
+ EXPECT_THAT(getpriority(PRIO_PROCESS, my_tid),
+ SyscallSucceedsWithValue(kParentPriority));
+
+ // Change the child thread's priority
+ EXPECT_THAT(setpriority(PRIO_PROCESS, my_tid, kChildPriority),
+ SyscallSucceeds());
+ });
+ th.Join();
+
+ // Check that parent's priority reemained the same even though
+ // the child's priority was altered
+ EXPECT_EQ(kParentPriority, getpriority(PRIO_PROCESS, syscall(__NR_gettid)));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/priority_execve.cc b/test/syscalls/linux/priority_execve.cc
new file mode 100644
index 000000000..5cb343bad
--- /dev/null
+++ b/test/syscalls/linux/priority_execve.cc
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+int main(int argc, char** argv, char** envp) {
+ errno = 0;
+ int prio = getpriority(PRIO_PROCESS, getpid());
+
+ // NOTE: getpriority() can legitimately return negative values
+ // in the range [-20, 0). If errno is set, exit with a value that
+ // could not be reached by a valid priority. Valid exit values
+ // for the test are in the range [1, 40], so we'll use 0.
+ if (errno != 0) {
+ printf("getpriority() failed with errno = %d\n", errno);
+ exit(0);
+ }
+
+ // Used by test to verify priority is being maintained through
+ // calls to execve(). Since prio should always be in the range
+ // [-20, 19], we offset by 20 so as not to have negative exit codes.
+ exit(20 - prio);
+
+ return 0;
+}
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
new file mode 100644
index 000000000..d6b875dbf
--- /dev/null
+++ b/test/syscalls/linux/proc.cc
@@ -0,0 +1,2173 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/utsname.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <atomic>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <regex>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/match.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/synchronization/notification.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/time_util.h"
+#include "test/util/timer_util.h"
+
+// NOTE(magi): No, this isn't really a syscall but this is a really simple
+// way to get it tested on both gVisor, PTrace and Linux.
+
+using ::testing::AllOf;
+using ::testing::AnyOf;
+using ::testing::ContainerEq;
+using ::testing::Contains;
+using ::testing::ContainsRegex;
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::HasSubstr;
+using ::testing::IsSupersetOf;
+using ::testing::Pair;
+using ::testing::UnorderedElementsAre;
+using ::testing::UnorderedElementsAreArray;
+
+// Exported by glibc.
+extern char** environ;
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+#ifndef SUID_DUMP_DISABLE
+#define SUID_DUMP_DISABLE 0
+#endif /* SUID_DUMP_DISABLE */
+#ifndef SUID_DUMP_USER
+#define SUID_DUMP_USER 1
+#endif /* SUID_DUMP_USER */
+#ifndef SUID_DUMP_ROOT
+#define SUID_DUMP_ROOT 2
+#endif /* SUID_DUMP_ROOT */
+
+#if defined(__x86_64__) || defined(__i386__)
+// This list of "required" fields is taken from reading the file
+// arch/x86/kernel/cpu/proc.c and seeing which fields will be unconditionally
+// printed by the kernel.
+static const char* required_fields[] = {
+ "processor",
+ "vendor_id",
+ "cpu family",
+ "model\t\t:",
+ "model name",
+ "stepping",
+ "cpu MHz",
+ "fpu\t\t:",
+ "fpu_exception",
+ "cpuid level",
+ "wp",
+ "bogomips",
+ "clflush size",
+ "cache_alignment",
+ "address sizes",
+ "power management",
+};
+#elif __aarch64__
+// This list of "required" fields is taken from reading the file
+// arch/arm64/kernel/cpuinfo.c and seeing which fields will be unconditionally
+// printed by the kernel.
+static const char* required_fields[] = {
+ "processor", "BogoMIPS", "Features", "CPU implementer",
+ "CPU architecture", "CPU variant", "CPU part", "CPU revision",
+};
+#else
+#error "Unknown architecture"
+#endif
+
+// Takes the subprocess command line and pid.
+// If it returns !OK, WithSubprocess returns immediately.
+using SubprocessCallback = std::function<PosixError(int)>;
+
+std::vector<std::string> saved_argv; // NOLINT
+
+// Helper function to dump /proc/{pid}/status and check the
+// state data. State should = "Z" for zombied or "RSD" for
+// running, interruptible sleeping (S), or uninterruptible sleep
+// (D).
+void CompareProcessState(absl::string_view state, int pid) {
+ auto status_file = ASSERT_NO_ERRNO_AND_VALUE(
+ GetContents(absl::StrCat("/proc/", pid, "/status")));
+ // N.B. POSIX extended regexes don't support shorthand character classes (\w)
+ // inside of brackets.
+ EXPECT_THAT(status_file,
+ ContainsRegex(absl::StrCat("State:.[", state,
+ R"EOL(]\s+\([a-zA-Z ]+\))EOL")));
+}
+
+// Run callbacks while a subprocess is running, zombied, and/or exited.
+PosixError WithSubprocess(SubprocessCallback const& running,
+ SubprocessCallback const& zombied,
+ SubprocessCallback const& exited) {
+ int pipe_fds[2] = {};
+ if (pipe(pipe_fds) < 0) {
+ return PosixError(errno, "pipe");
+ }
+
+ int child_pid = fork();
+ if (child_pid < 0) {
+ return PosixError(errno, "fork");
+ }
+
+ if (child_pid == 0) {
+ close(pipe_fds[0]); // Close the read end.
+ const DisableSave ds; // Timing issues.
+
+ // Write to the pipe to tell it we're ready.
+ char buf = 'a';
+ int res = 0;
+ res = WriteFd(pipe_fds[1], &buf, sizeof(buf));
+ TEST_CHECK_MSG(res == sizeof(buf), "Write failure in subprocess");
+
+ while (true) {
+ SleepSafe(absl::Milliseconds(100));
+ }
+ }
+
+ close(pipe_fds[1]); // Close the write end.
+
+ int status = 0;
+ auto wait_cleanup = Cleanup([child_pid, &status] {
+ EXPECT_THAT(waitpid(child_pid, &status, 0), SyscallSucceeds());
+ });
+ auto kill_cleanup = Cleanup([child_pid] {
+ EXPECT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds());
+ });
+
+ // Wait for the child.
+ char buf = 0;
+ int res = ReadFd(pipe_fds[0], &buf, sizeof(buf));
+ if (res < 0) {
+ return PosixError(errno, "Read from pipe");
+ } else if (res == 0) {
+ return PosixError(EPIPE, "Unable to read from pipe: EOF");
+ }
+
+ if (running) {
+ // The first arg, RSD, refers to a "running process", or a process with a
+ // state of Running (R), Interruptable Sleep (S) or Uninterruptable
+ // Sleep (D).
+ CompareProcessState("RSD", child_pid);
+ RETURN_IF_ERRNO(running(child_pid));
+ }
+
+ // Kill the process.
+ kill_cleanup.Release()();
+ siginfo_t info;
+ // Wait until the child process has exited (WEXITED flag) but don't
+ // reap the child (WNOWAIT flag).
+ EXPECT_THAT(waitid(P_PID, child_pid, &info, WNOWAIT | WEXITED),
+ SyscallSucceeds());
+
+ if (zombied) {
+ // Arg of "Z" refers to a Zombied Process.
+ CompareProcessState("Z", child_pid);
+ RETURN_IF_ERRNO(zombied(child_pid));
+ }
+
+ // Wait on the process.
+ wait_cleanup.Release()();
+ // If the process is reaped, then then this should return
+ // with ECHILD.
+ EXPECT_THAT(waitpid(child_pid, &status, WNOHANG),
+ SyscallFailsWithErrno(ECHILD));
+
+ if (exited) {
+ RETURN_IF_ERRNO(exited(child_pid));
+ }
+
+ return NoError();
+}
+
+// Access the file returned by name when a subprocess is running.
+PosixError AccessWhileRunning(std::function<std::string(int pid)> name,
+ int flags, std::function<void(int fd)> access) {
+ FileDescriptor fd;
+ return WithSubprocess(
+ [&](int pid) -> PosixError {
+ // Running.
+ ASSIGN_OR_RETURN_ERRNO(fd, Open(name(pid), flags));
+
+ access(fd.get());
+ return NoError();
+ },
+ nullptr, nullptr);
+}
+
+// Access the file returned by name when the a subprocess is zombied.
+PosixError AccessWhileZombied(std::function<std::string(int pid)> name,
+ int flags, std::function<void(int fd)> access) {
+ FileDescriptor fd;
+ return WithSubprocess(
+ [&](int pid) -> PosixError {
+ // Running.
+ ASSIGN_OR_RETURN_ERRNO(fd, Open(name(pid), flags));
+ return NoError();
+ },
+ [&](int pid) -> PosixError {
+ // Zombied.
+ access(fd.get());
+ return NoError();
+ },
+ nullptr);
+}
+
+// Access the file returned by name when the a subprocess is exited.
+PosixError AccessWhileExited(std::function<std::string(int pid)> name,
+ int flags, std::function<void(int fd)> access) {
+ FileDescriptor fd;
+ return WithSubprocess(
+ [&](int pid) -> PosixError {
+ // Running.
+ ASSIGN_OR_RETURN_ERRNO(fd, Open(name(pid), flags));
+ return NoError();
+ },
+ nullptr,
+ [&](int pid) -> PosixError {
+ // Exited.
+ access(fd.get());
+ return NoError();
+ });
+}
+
+// ReadFd(fd=/proc/PID/basename) while PID is running.
+int ReadWhileRunning(std::string const& basename, void* buf, size_t count) {
+ int ret = 0;
+ int err = 0;
+ EXPECT_NO_ERRNO(AccessWhileRunning(
+ [&](int pid) -> std::string {
+ return absl::StrCat("/proc/", pid, "/", basename);
+ },
+ O_RDONLY,
+ [&](int fd) {
+ ret = ReadFd(fd, buf, count);
+ err = errno;
+ }));
+ errno = err;
+ return ret;
+}
+
+// ReadFd(fd=/proc/PID/basename) while PID is zombied.
+int ReadWhileZombied(std::string const& basename, void* buf, size_t count) {
+ int ret = 0;
+ int err = 0;
+ EXPECT_NO_ERRNO(AccessWhileZombied(
+ [&](int pid) -> std::string {
+ return absl::StrCat("/proc/", pid, "/", basename);
+ },
+ O_RDONLY,
+ [&](int fd) {
+ ret = ReadFd(fd, buf, count);
+ err = errno;
+ }));
+ errno = err;
+ return ret;
+}
+
+// ReadFd(fd=/proc/PID/basename) while PID is exited.
+int ReadWhileExited(std::string const& basename, void* buf, size_t count) {
+ int ret = 0;
+ int err = 0;
+ EXPECT_NO_ERRNO(AccessWhileExited(
+ [&](int pid) -> std::string {
+ return absl::StrCat("/proc/", pid, "/", basename);
+ },
+ O_RDONLY,
+ [&](int fd) {
+ ret = ReadFd(fd, buf, count);
+ err = errno;
+ }));
+ errno = err;
+ return ret;
+}
+
+// readlinkat(fd=/proc/PID/, basename) while PID is running.
+int ReadlinkWhileRunning(std::string const& basename, char* buf, size_t count) {
+ int ret = 0;
+ int err = 0;
+ EXPECT_NO_ERRNO(AccessWhileRunning(
+ [&](int pid) -> std::string { return absl::StrCat("/proc/", pid, "/"); },
+ O_DIRECTORY,
+ [&](int fd) {
+ ret = readlinkat(fd, basename.c_str(), buf, count);
+ err = errno;
+ }));
+ errno = err;
+ return ret;
+}
+
+// readlinkat(fd=/proc/PID/, basename) while PID is zombied.
+int ReadlinkWhileZombied(std::string const& basename, char* buf, size_t count) {
+ int ret = 0;
+ int err = 0;
+ EXPECT_NO_ERRNO(AccessWhileZombied(
+ [&](int pid) -> std::string { return absl::StrCat("/proc/", pid, "/"); },
+ O_DIRECTORY,
+ [&](int fd) {
+ ret = readlinkat(fd, basename.c_str(), buf, count);
+ err = errno;
+ }));
+ errno = err;
+ return ret;
+}
+
+// readlinkat(fd=/proc/PID/, basename) while PID is exited.
+int ReadlinkWhileExited(std::string const& basename, char* buf, size_t count) {
+ int ret = 0;
+ int err = 0;
+ EXPECT_NO_ERRNO(AccessWhileExited(
+ [&](int pid) -> std::string { return absl::StrCat("/proc/", pid, "/"); },
+ O_DIRECTORY,
+ [&](int fd) {
+ ret = readlinkat(fd, basename.c_str(), buf, count);
+ err = errno;
+ }));
+ errno = err;
+ return ret;
+}
+
+TEST(ProcTest, NotFoundInRoot) {
+ struct stat s;
+ EXPECT_THAT(stat("/proc/foobar", &s), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(ProcSelfTest, IsThreadGroupLeader) {
+ ScopedThread([] {
+ const pid_t tgid = getpid();
+ const pid_t tid = syscall(SYS_gettid);
+ EXPECT_NE(tgid, tid);
+ auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/self"));
+ EXPECT_EQ(link, absl::StrCat(tgid));
+ });
+}
+
+TEST(ProcThreadSelfTest, Basic) {
+ const pid_t tgid = getpid();
+ const pid_t tid = syscall(SYS_gettid);
+ EXPECT_EQ(tgid, tid);
+ auto link_threadself =
+ ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/thread-self"));
+ EXPECT_EQ(link_threadself, absl::StrCat(tgid, "/task/", tid));
+ // Just read one file inside thread-self to ensure that the link is valid.
+ auto link_threadself_exe =
+ ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/thread-self/exe"));
+ auto link_procself_exe =
+ ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/self/exe"));
+ EXPECT_EQ(link_threadself_exe, link_procself_exe);
+}
+
+TEST(ProcThreadSelfTest, Thread) {
+ ScopedThread([] {
+ const pid_t tgid = getpid();
+ const pid_t tid = syscall(SYS_gettid);
+ EXPECT_NE(tgid, tid);
+ auto link_threadself =
+ ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/thread-self"));
+
+ EXPECT_EQ(link_threadself, absl::StrCat(tgid, "/task/", tid));
+ // Just read one file inside thread-self to ensure that the link is valid.
+ auto link_threadself_exe =
+ ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/thread-self/exe"));
+ auto link_procself_exe =
+ ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/self/exe"));
+ EXPECT_EQ(link_threadself_exe, link_procself_exe);
+ // A thread should not have "/proc/<tid>/task".
+ struct stat s;
+ EXPECT_THAT(stat("/proc/thread-self/task", &s),
+ SyscallFailsWithErrno(ENOENT));
+ });
+}
+
+// Returns the /proc/PID/maps entry for the MAP_PRIVATE | MAP_ANONYMOUS mapping
+// m with start address addr and length len.
+std::string AnonymousMapsEntry(uintptr_t addr, size_t len, int prot) {
+ return absl::StrCat(absl::Hex(addr, absl::PadSpec::kZeroPad8), "-",
+ absl::Hex(addr + len, absl::PadSpec::kZeroPad8), " ",
+ prot & PROT_READ ? "r" : "-",
+ prot & PROT_WRITE ? "w" : "-",
+ prot & PROT_EXEC ? "x" : "-", "p 00000000 00:00 0 ");
+}
+
+std::string AnonymousMapsEntryForMapping(const Mapping& m, int prot) {
+ return AnonymousMapsEntry(m.addr(), m.len(), prot);
+}
+
+PosixErrorOr<std::map<uint64_t, uint64_t>> ReadProcSelfAuxv() {
+ std::string auxv_file;
+ RETURN_IF_ERRNO(GetContents("/proc/self/auxv", &auxv_file));
+ const Elf64_auxv_t* auxv_data =
+ reinterpret_cast<const Elf64_auxv_t*>(auxv_file.data());
+ std::map<uint64_t, uint64_t> auxv_entries;
+ for (int i = 0; auxv_data[i].a_type != AT_NULL; i++) {
+ auto a_type = auxv_data[i].a_type;
+ EXPECT_EQ(0, auxv_entries.count(a_type)) << "a_type: " << a_type;
+ auxv_entries.emplace(a_type, auxv_data[i].a_un.a_val);
+ }
+ return auxv_entries;
+}
+
+TEST(ProcSelfAuxv, EntryPresence) {
+ auto auxv_entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfAuxv());
+
+ EXPECT_EQ(auxv_entries.count(AT_ENTRY), 1);
+ EXPECT_EQ(auxv_entries.count(AT_PHDR), 1);
+ EXPECT_EQ(auxv_entries.count(AT_PHENT), 1);
+ EXPECT_EQ(auxv_entries.count(AT_PHNUM), 1);
+ EXPECT_EQ(auxv_entries.count(AT_BASE), 1);
+ EXPECT_EQ(auxv_entries.count(AT_UID), 1);
+ EXPECT_EQ(auxv_entries.count(AT_EUID), 1);
+ EXPECT_EQ(auxv_entries.count(AT_GID), 1);
+ EXPECT_EQ(auxv_entries.count(AT_EGID), 1);
+ EXPECT_EQ(auxv_entries.count(AT_SECURE), 1);
+ EXPECT_EQ(auxv_entries.count(AT_CLKTCK), 1);
+ EXPECT_EQ(auxv_entries.count(AT_RANDOM), 1);
+ EXPECT_EQ(auxv_entries.count(AT_EXECFN), 1);
+ EXPECT_EQ(auxv_entries.count(AT_PAGESZ), 1);
+ EXPECT_EQ(auxv_entries.count(AT_SYSINFO_EHDR), 1);
+}
+
+TEST(ProcSelfAuxv, EntryValues) {
+ auto proc_auxv = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfAuxv());
+
+ // We need to find the ELF auxiliary vector. The section of memory pointed to
+ // by envp contains some pointers to non-null pointers, followed by a single
+ // pointer to a null pointer, followed by the auxiliary vector.
+ char** envpi = environ;
+ while (*envpi) {
+ ++envpi;
+ }
+
+ const Elf64_auxv_t* envp_auxv =
+ reinterpret_cast<const Elf64_auxv_t*>(envpi + 1);
+ int i;
+ for (i = 0; envp_auxv[i].a_type != AT_NULL; i++) {
+ auto a_type = envp_auxv[i].a_type;
+ EXPECT_EQ(proc_auxv.count(a_type), 1);
+ EXPECT_EQ(proc_auxv[a_type], envp_auxv[i].a_un.a_val)
+ << "a_type: " << a_type;
+ }
+ EXPECT_EQ(i, proc_auxv.size());
+}
+
+// Just open and read /proc/self/maps, check that we can find [stack]
+TEST(ProcSelfMaps, Basic) {
+ auto proc_self_maps =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps"));
+
+ std::vector<std::string> strings = absl::StrSplit(proc_self_maps, '\n');
+ std::vector<std::string> stacks;
+ // Make sure there's a stack in there.
+ for (const auto& str : strings) {
+ if (str.find("[stack]") != std::string::npos) {
+ stacks.push_back(str);
+ }
+ }
+ ASSERT_EQ(1, stacks.size()) << "[stack] not found in: " << proc_self_maps;
+ // Linux pads to 73 characters then we add 7.
+ EXPECT_EQ(80, stacks[0].length());
+}
+
+TEST(ProcSelfMaps, Map1) {
+ Mapping mapping =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_READ, MAP_PRIVATE));
+ auto proc_self_maps =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps"));
+ std::vector<std::string> strings = absl::StrSplit(proc_self_maps, '\n');
+ std::vector<std::string> addrs;
+ // Make sure if is listed.
+ for (const auto& str : strings) {
+ if (str == AnonymousMapsEntryForMapping(mapping, PROT_READ)) {
+ addrs.push_back(str);
+ }
+ }
+ ASSERT_EQ(1, addrs.size());
+}
+
+TEST(ProcSelfMaps, Map2) {
+ // NOTE(magi): The permissions must be different or the pages will get merged.
+ Mapping map1 = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_EXEC, MAP_PRIVATE));
+ Mapping map2 =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_WRITE, MAP_PRIVATE));
+
+ auto proc_self_maps =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps"));
+ std::vector<std::string> strings = absl::StrSplit(proc_self_maps, '\n');
+ std::vector<std::string> addrs;
+ // Make sure if is listed.
+ for (const auto& str : strings) {
+ if (str == AnonymousMapsEntryForMapping(map1, PROT_READ | PROT_EXEC)) {
+ addrs.push_back(str);
+ }
+ }
+ ASSERT_EQ(1, addrs.size());
+ addrs.clear();
+ for (const auto& str : strings) {
+ if (str == AnonymousMapsEntryForMapping(map2, PROT_WRITE)) {
+ addrs.push_back(str);
+ }
+ }
+ ASSERT_EQ(1, addrs.size());
+}
+
+TEST(ProcSelfMaps, MapUnmap) {
+ Mapping map1 = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_EXEC, MAP_PRIVATE));
+ Mapping map2 =
+ ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(kPageSize, PROT_WRITE, MAP_PRIVATE));
+
+ auto proc_self_maps =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps"));
+ std::vector<std::string> strings = absl::StrSplit(proc_self_maps, '\n');
+ std::vector<std::string> addrs;
+ // Make sure if is listed.
+ for (const auto& str : strings) {
+ if (str == AnonymousMapsEntryForMapping(map1, PROT_READ | PROT_EXEC)) {
+ addrs.push_back(str);
+ }
+ }
+ ASSERT_EQ(1, addrs.size()) << proc_self_maps;
+ addrs.clear();
+ for (const auto& str : strings) {
+ if (str == AnonymousMapsEntryForMapping(map2, PROT_WRITE)) {
+ addrs.push_back(str);
+ }
+ }
+ ASSERT_EQ(1, addrs.size());
+
+ map2.reset();
+
+ // Read it again.
+ proc_self_maps = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps"));
+ strings = absl::StrSplit(proc_self_maps, '\n');
+ // First entry should be there.
+ addrs.clear();
+ for (const auto& str : strings) {
+ if (str == AnonymousMapsEntryForMapping(map1, PROT_READ | PROT_EXEC)) {
+ addrs.push_back(str);
+ }
+ }
+ ASSERT_EQ(1, addrs.size());
+ addrs.clear();
+ // But not the second.
+ for (const auto& str : strings) {
+ if (str == AnonymousMapsEntryForMapping(map2, PROT_WRITE)) {
+ addrs.push_back(str);
+ }
+ }
+ ASSERT_EQ(0, addrs.size());
+}
+
+TEST(ProcSelfMaps, Mprotect) {
+ // FIXME(jamieliu): Linux's mprotect() sometimes fails to merge VMAs in this
+ // case.
+ SKIP_IF(!IsRunningOnGvisor());
+
+ // Reserve 5 pages of address space.
+ Mapping m = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(5 * kPageSize, PROT_NONE, MAP_PRIVATE));
+
+ // Change the permissions on the middle 3 pages. (The first and last pages may
+ // be merged with other vmas on either side, so they aren't tested directly;
+ // they just ensure that the middle 3 pages are bracketed by VMAs with
+ // incompatible permissions.)
+ ASSERT_THAT(mprotect(reinterpret_cast<void*>(m.addr() + kPageSize),
+ 3 * kPageSize, PROT_READ),
+ SyscallSucceeds());
+
+ // Check that the middle 3 pages make up a single VMA.
+ auto proc_self_maps =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps"));
+ std::vector<std::string> strings = absl::StrSplit(proc_self_maps, '\n');
+ EXPECT_THAT(strings, Contains(AnonymousMapsEntry(m.addr() + kPageSize,
+ 3 * kPageSize, PROT_READ)));
+
+ // Change the permissions on the middle page only.
+ ASSERT_THAT(mprotect(reinterpret_cast<void*>(m.addr() + 2 * kPageSize),
+ kPageSize, PROT_READ | PROT_WRITE),
+ SyscallSucceeds());
+
+ // Check that the single VMA has been split into 3 VMAs.
+ proc_self_maps = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps"));
+ strings = absl::StrSplit(proc_self_maps, '\n');
+ EXPECT_THAT(
+ strings,
+ IsSupersetOf(
+ {AnonymousMapsEntry(m.addr() + kPageSize, kPageSize, PROT_READ),
+ AnonymousMapsEntry(m.addr() + 2 * kPageSize, kPageSize,
+ PROT_READ | PROT_WRITE),
+ AnonymousMapsEntry(m.addr() + 3 * kPageSize, kPageSize,
+ PROT_READ)}));
+
+ // Change the permissions on the middle page back.
+ ASSERT_THAT(mprotect(reinterpret_cast<void*>(m.addr() + 2 * kPageSize),
+ kPageSize, PROT_READ),
+ SyscallSucceeds());
+
+ // Check that the 3 VMAs have been merged back into a single VMA.
+ proc_self_maps = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps"));
+ strings = absl::StrSplit(proc_self_maps, '\n');
+ EXPECT_THAT(strings, Contains(AnonymousMapsEntry(m.addr() + kPageSize,
+ 3 * kPageSize, PROT_READ)));
+}
+
+TEST(ProcSelfFd, OpenFd) {
+ int pipe_fds[2];
+ ASSERT_THAT(pipe2(pipe_fds, O_CLOEXEC), SyscallSucceeds());
+
+ // Reopen the write end.
+ const std::string path = absl::StrCat("/proc/self/fd/", pipe_fds[1]);
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, O_WRONLY));
+
+ // Ensure that a read/write works.
+ const std::string data = "hello";
+ std::unique_ptr<char[]> buffer(new char[data.size()]);
+ EXPECT_THAT(write(fd.get(), data.c_str(), data.size()),
+ SyscallSucceedsWithValue(5));
+ EXPECT_THAT(read(pipe_fds[0], buffer.get(), data.size()),
+ SyscallSucceedsWithValue(5));
+ EXPECT_EQ(strncmp(buffer.get(), data.c_str(), data.size()), 0);
+
+ // Cleanup.
+ ASSERT_THAT(close(pipe_fds[0]), SyscallSucceeds());
+ ASSERT_THAT(close(pipe_fds[1]), SyscallSucceeds());
+}
+
+TEST(ProcSelfFdInfo, CorrectFds) {
+ // Make sure there is at least one open file.
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDONLY));
+
+ // Get files in /proc/self/fd.
+ auto fd_files = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/proc/self/fd", false));
+
+ // Get files in /proc/self/fdinfo.
+ auto fdinfo_files =
+ ASSERT_NO_ERRNO_AND_VALUE(ListDir("/proc/self/fdinfo", false));
+
+ // They should contain the same fds.
+ EXPECT_THAT(fd_files, UnorderedElementsAreArray(fdinfo_files));
+
+ // Both should contain fd.
+ auto fd_s = absl::StrCat(fd.get());
+ EXPECT_THAT(fd_files, Contains(fd_s));
+}
+
+TEST(ProcSelfFdInfo, Flags) {
+ std::string path = NewTempAbsPath();
+
+ // Create file here with O_CREAT to test that O_CREAT does not appear in
+ // fdinfo flags.
+ int flags = O_CREAT | O_RDWR | O_APPEND | O_CLOEXEC;
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(path, flags, 0644));
+
+ // Automatically delete path.
+ TempPath temp_path(path);
+
+ // O_CREAT does not appear in fdinfo flags.
+ flags &= ~O_CREAT;
+
+ // O_LARGEFILE always appears (on x86_64).
+ flags |= kOLargeFile;
+
+ auto fd_info = ASSERT_NO_ERRNO_AND_VALUE(
+ GetContents(absl::StrCat("/proc/self/fdinfo/", fd.get())));
+ EXPECT_THAT(fd_info, HasSubstr(absl::StrFormat("flags:\t%#o", flags)));
+}
+
+TEST(ProcSelfExe, Absolute) {
+ auto exe = ASSERT_NO_ERRNO_AND_VALUE(
+ ReadLink(absl::StrCat("/proc/", getpid(), "/exe")));
+ EXPECT_EQ(exe[0], '/');
+}
+
+// Sanity check for /proc/cpuinfo fields that must be present.
+TEST(ProcCpuinfo, RequiredFieldsArePresent) {
+ std::string proc_cpuinfo =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/cpuinfo"));
+ ASSERT_FALSE(proc_cpuinfo.empty());
+ std::vector<std::string> cpuinfo_fields = absl::StrSplit(proc_cpuinfo, '\n');
+
+ // Check that the usual fields are there. We don't really care about the
+ // contents.
+ for (const std::string& field : required_fields) {
+ EXPECT_THAT(proc_cpuinfo, HasSubstr(field));
+ }
+}
+
+TEST(ProcCpuinfo, DeniesWriteNonRoot) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_FOWNER)));
+
+ // Do setuid in a separate thread so that after finishing this test, the
+ // process can still open files the test harness created before starting this
+ // test. Otherwise, the files are created by root (UID before the test), but
+ // cannot be opened by the `uid` set below after the test. After calling
+ // setuid(non-zero-UID), there is no way to get root privileges back.
+ ScopedThread([&] {
+ // Use syscall instead of glibc setuid wrapper because we want this setuid
+ // call to only apply to this task. POSIX threads, however, require that all
+ // threads have the same UIDs, so using the setuid wrapper sets all threads'
+ // real UID.
+ // Also drops capabilities.
+ constexpr int kNobody = 65534;
+ EXPECT_THAT(syscall(SYS_setuid, kNobody), SyscallSucceeds());
+ EXPECT_THAT(open("/proc/cpuinfo", O_WRONLY), SyscallFailsWithErrno(EACCES));
+ // TODO(gvisor.dev/issue/1193): Properly support setting size attributes in
+ // kernfs.
+ if (!IsRunningOnGvisor() || IsRunningWithVFS1()) {
+ EXPECT_THAT(truncate("/proc/cpuinfo", 123),
+ SyscallFailsWithErrno(EACCES));
+ }
+ });
+}
+
+// With root privileges, it is possible to open /proc/cpuinfo with write mode,
+// but all write operations will return EIO.
+TEST(ProcCpuinfo, DeniesWriteRoot) {
+ // VFS1 does not behave differently for root/non-root.
+ SKIP_IF(IsRunningWithVFS1());
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_FOWNER)));
+
+ int fd;
+ EXPECT_THAT(fd = open("/proc/cpuinfo", O_WRONLY), SyscallSucceeds());
+ if (fd > 0) {
+ EXPECT_THAT(write(fd, "x", 1), SyscallFailsWithErrno(EIO));
+ EXPECT_THAT(pwrite(fd, "x", 1, 123), SyscallFailsWithErrno(EIO));
+ }
+ // TODO(gvisor.dev/issue/1193): Properly support setting size attributes in
+ // kernfs.
+ if (!IsRunningOnGvisor() || IsRunningWithVFS1()) {
+ if (fd > 0) {
+ EXPECT_THAT(ftruncate(fd, 123), SyscallFailsWithErrno(EIO));
+ }
+ EXPECT_THAT(truncate("/proc/cpuinfo", 123), SyscallFailsWithErrno(EIO));
+ }
+}
+
+// Sanity checks that uptime is present.
+TEST(ProcUptime, IsPresent) {
+ std::string proc_uptime =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/uptime"));
+ ASSERT_FALSE(proc_uptime.empty());
+ std::vector<std::string> uptime_parts = absl::StrSplit(proc_uptime, ' ');
+
+ // Parse once.
+ double uptime0, uptime1, idletime0, idletime1;
+ ASSERT_TRUE(absl::SimpleAtod(uptime_parts[0], &uptime0));
+ ASSERT_TRUE(absl::SimpleAtod(uptime_parts[1], &idletime0));
+
+ // Sleep for one second.
+ absl::SleepFor(absl::Seconds(1));
+
+ // Parse again.
+ proc_uptime = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/uptime"));
+ ASSERT_FALSE(proc_uptime.empty());
+ uptime_parts = absl::StrSplit(proc_uptime, ' ');
+ ASSERT_TRUE(absl::SimpleAtod(uptime_parts[0], &uptime1));
+ ASSERT_TRUE(absl::SimpleAtod(uptime_parts[1], &idletime1));
+
+ // Sanity check.
+ //
+ // We assert that between 0.99 and 59.99 seconds have passed. If more than a
+ // minute has passed, then we must be executing really, really slowly.
+ EXPECT_GE(uptime0, 0.0);
+ EXPECT_GE(idletime0, 0.0);
+ EXPECT_GT(uptime1, uptime0);
+ EXPECT_GE(uptime1, uptime0 + 0.99);
+ EXPECT_LE(uptime1, uptime0 + 59.99);
+ EXPECT_GE(idletime1, idletime0);
+}
+
+TEST(ProcMeminfo, ContainsBasicFields) {
+ std::string proc_meminfo =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/meminfo"));
+ EXPECT_THAT(proc_meminfo, AllOf(ContainsRegex(R"(MemTotal:\s+[0-9]+ kB)"),
+ ContainsRegex(R"(MemFree:\s+[0-9]+ kB)")));
+}
+
+TEST(ProcStat, ContainsBasicFields) {
+ std::string proc_stat = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/stat"));
+
+ std::vector<std::string> names;
+ for (auto const& line : absl::StrSplit(proc_stat, '\n')) {
+ std::vector<std::string> fields =
+ absl::StrSplit(line, ' ', absl::SkipWhitespace());
+ if (fields.empty()) {
+ continue;
+ }
+ names.push_back(fields[0]);
+ }
+
+ EXPECT_THAT(names,
+ IsSupersetOf({"cpu", "intr", "ctxt", "btime", "processes",
+ "procs_running", "procs_blocked", "softirq"}));
+}
+
+TEST(ProcStat, EndsWithNewline) {
+ std::string proc_stat = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/stat"));
+ EXPECT_EQ(proc_stat.back(), '\n');
+}
+
+TEST(ProcStat, Fields) {
+ std::string proc_stat = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/stat"));
+
+ std::vector<std::string> names;
+ for (auto const& line : absl::StrSplit(proc_stat, '\n')) {
+ std::vector<std::string> fields =
+ absl::StrSplit(line, ' ', absl::SkipWhitespace());
+ if (fields.empty()) {
+ continue;
+ }
+
+ if (absl::StartsWith(fields[0], "cpu")) {
+ // As of Linux 3.11, each CPU entry has 10 fields, plus the name.
+ EXPECT_GE(fields.size(), 11) << proc_stat;
+ } else if (fields[0] == "ctxt") {
+ // Single field.
+ EXPECT_EQ(fields.size(), 2) << proc_stat;
+ } else if (fields[0] == "btime") {
+ // Single field.
+ EXPECT_EQ(fields.size(), 2) << proc_stat;
+ } else if (fields[0] == "itime") {
+ // Single field.
+ ASSERT_EQ(fields.size(), 2) << proc_stat;
+ // This is the only floating point field.
+ double val;
+ EXPECT_TRUE(absl::SimpleAtod(fields[1], &val)) << proc_stat;
+ continue;
+ } else if (fields[0] == "processes") {
+ // Single field.
+ EXPECT_EQ(fields.size(), 2) << proc_stat;
+ } else if (fields[0] == "procs_running") {
+ // Single field.
+ EXPECT_EQ(fields.size(), 2) << proc_stat;
+ } else if (fields[0] == "procs_blocked") {
+ // Single field.
+ EXPECT_EQ(fields.size(), 2) << proc_stat;
+ } else if (fields[0] == "softirq") {
+ // As of Linux 3.11, there are 10 softirqs. 12 fields for name + total.
+ EXPECT_GE(fields.size(), 12) << proc_stat;
+ }
+
+ // All fields besides itime are valid base 10 numbers.
+ for (size_t i = 1; i < fields.size(); i++) {
+ uint64_t val;
+ EXPECT_TRUE(absl::SimpleAtoi(fields[i], &val)) << proc_stat;
+ }
+ }
+}
+
+TEST(ProcLoadavg, EndsWithNewline) {
+ std::string proc_loadvg =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/loadavg"));
+ EXPECT_EQ(proc_loadvg.back(), '\n');
+}
+
+TEST(ProcLoadavg, Fields) {
+ std::string proc_loadvg =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/loadavg"));
+ std::vector<std::string> lines = absl::StrSplit(proc_loadvg, '\n');
+
+ // Single line.
+ EXPECT_EQ(lines.size(), 2) << proc_loadvg;
+
+ std::vector<std::string> fields =
+ absl::StrSplit(lines[0], absl::ByAnyChar(" /"), absl::SkipWhitespace());
+
+ // Six fields.
+ EXPECT_EQ(fields.size(), 6) << proc_loadvg;
+
+ double val;
+ uint64_t val2;
+ // First three fields are floating point numbers.
+ EXPECT_TRUE(absl::SimpleAtod(fields[0], &val)) << proc_loadvg;
+ EXPECT_TRUE(absl::SimpleAtod(fields[1], &val)) << proc_loadvg;
+ EXPECT_TRUE(absl::SimpleAtod(fields[2], &val)) << proc_loadvg;
+ // Rest of the fields are valid base 10 numbers.
+ EXPECT_TRUE(absl::SimpleAtoi(fields[3], &val2)) << proc_loadvg;
+ EXPECT_TRUE(absl::SimpleAtoi(fields[4], &val2)) << proc_loadvg;
+ EXPECT_TRUE(absl::SimpleAtoi(fields[5], &val2)) << proc_loadvg;
+}
+
+// NOTE: Tests in priority.cc also check certain priority related fields in
+// /proc/self/stat.
+
+class ProcPidStatTest : public ::testing::TestWithParam<std::string> {};
+
+TEST_P(ProcPidStatTest, HasBasicFields) {
+ std::string proc_pid_stat = ASSERT_NO_ERRNO_AND_VALUE(
+ GetContents(absl::StrCat("/proc/", GetParam(), "/stat")));
+
+ ASSERT_FALSE(proc_pid_stat.empty());
+ std::vector<std::string> fields = absl::StrSplit(proc_pid_stat, ' ');
+ ASSERT_GE(fields.size(), 24);
+ EXPECT_EQ(absl::StrCat(getpid()), fields[0]);
+ // fields[1] is the thread name.
+ EXPECT_EQ("R", fields[2]); // task state
+ EXPECT_EQ(absl::StrCat(getppid()), fields[3]);
+
+ // If the test starts up quickly, then the process start time and the kernel
+ // boot time will be very close, and the proc starttime field (which is the
+ // delta of the two times) will be 0. For that unfortunate reason, we can
+ // only check that starttime >= 0, and not that it is strictly > 0.
+ uint64_t starttime;
+ ASSERT_TRUE(absl::SimpleAtoi(fields[21], &starttime));
+ EXPECT_GE(starttime, 0);
+
+ uint64_t vss;
+ ASSERT_TRUE(absl::SimpleAtoi(fields[22], &vss));
+ EXPECT_GT(vss, 0);
+
+ uint64_t rss;
+ ASSERT_TRUE(absl::SimpleAtoi(fields[23], &rss));
+ EXPECT_GT(rss, 0);
+
+ uint64_t rsslim;
+ ASSERT_TRUE(absl::SimpleAtoi(fields[24], &rsslim));
+ EXPECT_GT(rsslim, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(SelfAndNumericPid, ProcPidStatTest,
+ ::testing::Values("self", absl::StrCat(getpid())));
+
+using ProcPidStatmTest = ::testing::TestWithParam<std::string>;
+
+TEST_P(ProcPidStatmTest, HasBasicFields) {
+ std::string proc_pid_statm = ASSERT_NO_ERRNO_AND_VALUE(
+ GetContents(absl::StrCat("/proc/", GetParam(), "/statm")));
+ ASSERT_FALSE(proc_pid_statm.empty());
+ std::vector<std::string> fields = absl::StrSplit(proc_pid_statm, ' ');
+ ASSERT_GE(fields.size(), 7);
+
+ uint64_t vss;
+ ASSERT_TRUE(absl::SimpleAtoi(fields[0], &vss));
+ EXPECT_GT(vss, 0);
+
+ uint64_t rss;
+ ASSERT_TRUE(absl::SimpleAtoi(fields[1], &rss));
+ EXPECT_GT(rss, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(SelfAndNumericPid, ProcPidStatmTest,
+ ::testing::Values("self", absl::StrCat(getpid())));
+
+PosixErrorOr<uint64_t> CurrentRSS() {
+ ASSIGN_OR_RETURN_ERRNO(auto proc_self_stat, GetContents("/proc/self/stat"));
+ if (proc_self_stat.empty()) {
+ return PosixError(EINVAL, "empty /proc/self/stat");
+ }
+
+ std::vector<std::string> fields = absl::StrSplit(proc_self_stat, ' ');
+ if (fields.size() < 24) {
+ return PosixError(
+ EINVAL,
+ absl::StrCat("/proc/self/stat has too few fields: ", proc_self_stat));
+ }
+
+ uint64_t rss;
+ if (!absl::SimpleAtoi(fields[23], &rss)) {
+ return PosixError(
+ EINVAL, absl::StrCat("/proc/self/stat RSS field is not a number: ",
+ fields[23]));
+ }
+
+ // RSS is given in number of pages.
+ return rss * kPageSize;
+}
+
+// The size of mapping created by MapPopulateRSS.
+constexpr uint64_t kMappingSize = 100 << 20;
+
+// Tolerance on RSS comparisons to account for background thread mappings,
+// reclaimed pages, newly faulted pages, etc.
+constexpr uint64_t kRSSTolerance = 10 << 20;
+
+// Capture RSS before and after an anonymous mapping with passed prot.
+void MapPopulateRSS(int prot, uint64_t* before, uint64_t* after) {
+ *before = ASSERT_NO_ERRNO_AND_VALUE(CurrentRSS());
+
+ // N.B. The kernel asynchronously accumulates per-task RSS counters into the
+ // mm RSS, which is exposed by /proc/PID/stat. Task exit is a synchronization
+ // point (kernel/exit.c:do_exit -> sync_mm_rss), so perform the mapping on
+ // another thread to ensure it is reflected in RSS after the thread exits.
+ Mapping mapping;
+ ScopedThread t([&mapping, prot] {
+ mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kMappingSize, prot, MAP_PRIVATE | MAP_POPULATE));
+ });
+ t.Join();
+
+ *after = ASSERT_NO_ERRNO_AND_VALUE(CurrentRSS());
+}
+
+// TODO(b/73896574): Test for PROT_READ + MAP_POPULATE anonymous mappings. Their
+// semantics are more subtle:
+//
+// Small pages -> Zero page mapped, not counted in RSS
+// (mm/memory.c:do_anonymous_page).
+//
+// Huge pages (THP enabled, use_zero_page=0) -> Pages committed
+// (mm/memory.c:__handle_mm_fault -> create_huge_pmd).
+//
+// Huge pages (THP enabled, use_zero_page=1) -> Zero page mapped, not counted in
+// RSS (mm/huge_memory.c:do_huge_pmd_anonymous_page).
+
+// PROT_WRITE + MAP_POPULATE anonymous mappings are always committed.
+TEST(ProcSelfStat, PopulateWriteRSS) {
+ uint64_t before, after;
+ MapPopulateRSS(PROT_READ | PROT_WRITE, &before, &after);
+
+ // Mapping is committed.
+ EXPECT_NEAR(before + kMappingSize, after, kRSSTolerance);
+}
+
+// PROT_NONE + MAP_POPULATE anonymous mappings are never committed.
+TEST(ProcSelfStat, PopulateNoneRSS) {
+ uint64_t before, after;
+ MapPopulateRSS(PROT_NONE, &before, &after);
+
+ // Mapping not committed.
+ EXPECT_NEAR(before, after, kRSSTolerance);
+}
+
+// Returns the calling thread's name.
+PosixErrorOr<std::string> ThreadName() {
+ // "The buffer should allow space for up to 16 bytes; the returned std::string
+ // will be null-terminated if it is shorter than that." - prctl(2). But we
+ // always want the thread name to be null-terminated.
+ char thread_name[17];
+ int rc = prctl(PR_GET_NAME, thread_name, 0, 0, 0);
+ MaybeSave();
+ if (rc < 0) {
+ return PosixError(errno, "prctl(PR_GET_NAME)");
+ }
+ thread_name[16] = '\0';
+ return std::string(thread_name);
+}
+
+// Parses the contents of a /proc/[pid]/status file into a collection of
+// key-value pairs.
+PosixErrorOr<std::map<std::string, std::string>> ParseProcStatus(
+ absl::string_view status_str) {
+ std::map<std::string, std::string> fields;
+ for (absl::string_view const line :
+ absl::StrSplit(status_str, '\n', absl::SkipWhitespace())) {
+ const std::pair<absl::string_view, absl::string_view> kv =
+ absl::StrSplit(line, absl::MaxSplits(":\t", 1));
+ if (kv.first.empty()) {
+ return PosixError(
+ EINVAL, absl::StrCat("failed to parse key in line \"", line, "\""));
+ }
+ std::string key(kv.first);
+ if (fields.count(key)) {
+ return PosixError(EINVAL,
+ absl::StrCat("duplicate key \"", kv.first, "\""));
+ }
+ std::string value(kv.second);
+ absl::StripLeadingAsciiWhitespace(&value);
+ fields.emplace(std::move(key), std::move(value));
+ }
+ return fields;
+}
+
+TEST(ParseProcStatusTest, ParsesSimpleStatusFileWithMixedWhitespaceCorrectly) {
+ EXPECT_THAT(
+ ParseProcStatus(
+ "Name:\tinit\nState:\tS (sleeping)\nCapEff:\t 0000001fffffffff\n"),
+ IsPosixErrorOkAndHolds(UnorderedElementsAre(
+ Pair("Name", "init"), Pair("State", "S (sleeping)"),
+ Pair("CapEff", "0000001fffffffff"))));
+}
+
+TEST(ParseProcStatusTest, DetectsDuplicateKeys) {
+ auto proc_status_or = ParseProcStatus("Name:\tfoo\nName:\tfoo\n");
+ EXPECT_THAT(proc_status_or,
+ PosixErrorIs(EINVAL, ::testing::StrEq("duplicate key \"Name\"")));
+}
+
+TEST(ParseProcStatusTest, DetectsMissingTabs) {
+ EXPECT_THAT(ParseProcStatus("Name:foo\nPid: 1\n"),
+ IsPosixErrorOkAndHolds(UnorderedElementsAre(Pair("Name:foo", ""),
+ Pair("Pid: 1", ""))));
+}
+
+TEST(ProcPidStatusTest, HasBasicFields) {
+ // Do this on a separate thread since we want tgid != tid.
+ ScopedThread([] {
+ const pid_t tgid = getpid();
+ const pid_t tid = syscall(SYS_gettid);
+ EXPECT_NE(tgid, tid);
+ const auto thread_name = ASSERT_NO_ERRNO_AND_VALUE(ThreadName());
+
+ std::string status_str = ASSERT_NO_ERRNO_AND_VALUE(
+ GetContents(absl::StrCat("/proc/", tid, "/status")));
+
+ ASSERT_FALSE(status_str.empty());
+ const auto status = ASSERT_NO_ERRNO_AND_VALUE(ParseProcStatus(status_str));
+ EXPECT_THAT(status, IsSupersetOf({Pair("Name", thread_name),
+ Pair("Tgid", absl::StrCat(tgid)),
+ Pair("Pid", absl::StrCat(tid)),
+ Pair("PPid", absl::StrCat(getppid()))}));
+ });
+}
+
+TEST(ProcPidStatusTest, StateRunning) {
+ // Task must be running when reading the file.
+ const pid_t tid = syscall(SYS_gettid);
+ std::string status_str = ASSERT_NO_ERRNO_AND_VALUE(
+ GetContents(absl::StrCat("/proc/", tid, "/status")));
+
+ EXPECT_THAT(ParseProcStatus(status_str),
+ IsPosixErrorOkAndHolds(Contains(Pair("State", "R (running)"))));
+}
+
+TEST(ProcPidStatusTest, StateSleeping_NoRandomSave) {
+ // Starts a child process that blocks and checks that State is sleeping.
+ auto res = WithSubprocess(
+ [&](int pid) -> PosixError {
+ // Because this test is timing based we will disable cooperative saving
+ // and the test itself also has random saving disabled.
+ const DisableSave ds;
+ // Try multiple times in case the child isn't sleeping when status file
+ // is read.
+ MonotonicTimer timer;
+ timer.Start();
+ for (;;) {
+ ASSIGN_OR_RETURN_ERRNO(
+ std::string status_str,
+ GetContents(absl::StrCat("/proc/", pid, "/status")));
+ ASSIGN_OR_RETURN_ERRNO(auto map, ParseProcStatus(status_str));
+ if (map["State"] == std::string("S (sleeping)")) {
+ // Test passed!
+ return NoError();
+ }
+ if (timer.Duration() > absl::Seconds(10)) {
+ return PosixError(ETIMEDOUT, "Timeout waiting for child to sleep");
+ }
+ absl::SleepFor(absl::Milliseconds(10));
+ }
+ },
+ nullptr, nullptr);
+ ASSERT_NO_ERRNO(res);
+}
+
+TEST(ProcPidStatusTest, ValuesAreTabDelimited) {
+ std::string status_str =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/status"));
+ ASSERT_FALSE(status_str.empty());
+ for (absl::string_view const line :
+ absl::StrSplit(status_str, '\n', absl::SkipWhitespace())) {
+ EXPECT_NE(std::string::npos, line.find(":\t"));
+ }
+}
+
+// Threads properly counts running threads.
+//
+// TODO(mpratt): Test zombied threads while the thread group leader is still
+// running with generalized fork and clone children from the wait test.
+TEST(ProcPidStatusTest, Threads) {
+ char buf[4096] = {};
+ EXPECT_THAT(ReadWhileRunning("status", buf, sizeof(buf) - 1),
+ SyscallSucceedsWithValue(Gt(0)));
+
+ auto status = ASSERT_NO_ERRNO_AND_VALUE(ParseProcStatus(buf));
+ auto it = status.find("Threads");
+ ASSERT_NE(it, status.end());
+ int threads = -1;
+ EXPECT_TRUE(absl::SimpleAtoi(it->second, &threads))
+ << "Threads value " << it->second << " is not a number";
+ // Don't make assumptions about the exact number of threads, as it may not be
+ // constant.
+ EXPECT_GE(threads, 1);
+
+ memset(buf, 0, sizeof(buf));
+ EXPECT_THAT(ReadWhileZombied("status", buf, sizeof(buf) - 1),
+ SyscallSucceedsWithValue(Gt(0)));
+
+ status = ASSERT_NO_ERRNO_AND_VALUE(ParseProcStatus(buf));
+ it = status.find("Threads");
+ ASSERT_NE(it, status.end());
+ threads = -1;
+ EXPECT_TRUE(absl::SimpleAtoi(it->second, &threads))
+ << "Threads value " << it->second << " is not a number";
+ // There must be only the thread group leader remaining, zombied.
+ EXPECT_EQ(threads, 1);
+}
+
+// Returns true if all characters in s are digits.
+bool IsDigits(absl::string_view s) {
+ return std::all_of(s.begin(), s.end(), absl::ascii_isdigit);
+}
+
+TEST(ProcPidStatTest, VmStats) {
+ std::string status_str =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/status"));
+ ASSERT_FALSE(status_str.empty());
+ auto status = ASSERT_NO_ERRNO_AND_VALUE(ParseProcStatus(status_str));
+
+ const auto vss_it = status.find("VmSize");
+ ASSERT_NE(vss_it, status.end());
+
+ absl::string_view vss_str(vss_it->second);
+
+ // Room for the " kB" suffix plus at least one digit.
+ ASSERT_GT(vss_str.length(), 3);
+ EXPECT_TRUE(absl::EndsWith(vss_str, " kB"));
+ // Everything else is part of a number.
+ EXPECT_TRUE(IsDigits(vss_str.substr(0, vss_str.length() - 3))) << vss_str;
+ // ... which is not 0.
+ EXPECT_NE('0', vss_str[0]);
+
+ const auto rss_it = status.find("VmRSS");
+ ASSERT_NE(rss_it, status.end());
+
+ absl::string_view rss_str(rss_it->second);
+
+ // Room for the " kB" suffix plus at least one digit.
+ ASSERT_GT(rss_str.length(), 3);
+ EXPECT_TRUE(absl::EndsWith(rss_str, " kB"));
+ // Everything else is part of a number.
+ EXPECT_TRUE(IsDigits(rss_str.substr(0, rss_str.length() - 3))) << rss_str;
+ // ... which is not 0.
+ EXPECT_NE('0', rss_str[0]);
+
+ const auto data_it = status.find("VmData");
+ ASSERT_NE(data_it, status.end());
+
+ absl::string_view data_str(data_it->second);
+
+ // Room for the " kB" suffix plus at least one digit.
+ ASSERT_GT(data_str.length(), 3);
+ EXPECT_TRUE(absl::EndsWith(data_str, " kB"));
+ // Everything else is part of a number.
+ EXPECT_TRUE(IsDigits(data_str.substr(0, data_str.length() - 3))) << data_str;
+ // ... which is not 0.
+ EXPECT_NE('0', data_str[0]);
+}
+
+// Parse an array of NUL-terminated char* arrays, returning a vector of
+// strings.
+std::vector<std::string> ParseNulTerminatedStrings(std::string contents) {
+ EXPECT_EQ('\0', contents.back());
+ // The split will leave an empty string if the NUL-byte remains, so pop
+ // it.
+ contents.pop_back();
+
+ return absl::StrSplit(contents, '\0');
+}
+
+TEST(ProcPidCmdline, MatchesArgv) {
+ std::vector<std::string> proc_cmdline = ParseNulTerminatedStrings(
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/cmdline")));
+ EXPECT_THAT(saved_argv, ContainerEq(proc_cmdline));
+}
+
+TEST(ProcPidEnviron, MatchesEnviron) {
+ std::vector<std::string> proc_environ = ParseNulTerminatedStrings(
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/environ")));
+ // Get the environment from the environ variable, which we will compare with
+ // /proc/self/environ.
+ std::vector<std::string> env;
+ for (char** v = environ; *v; v++) {
+ env.push_back(*v);
+ }
+ EXPECT_THAT(env, ContainerEq(proc_environ));
+}
+
+TEST(ProcPidCmdline, SubprocessForkSameCmdline) {
+ std::vector<std::string> proc_cmdline_parent;
+ std::vector<std::string> proc_cmdline;
+ proc_cmdline_parent = ParseNulTerminatedStrings(
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/cmdline")));
+ auto res = WithSubprocess(
+ [&](int pid) -> PosixError {
+ ASSIGN_OR_RETURN_ERRNO(
+ auto raw_cmdline,
+ GetContents(absl::StrCat("/proc/", pid, "/cmdline")));
+ proc_cmdline = ParseNulTerminatedStrings(raw_cmdline);
+ return NoError();
+ },
+ nullptr, nullptr);
+ ASSERT_NO_ERRNO(res);
+
+ for (size_t i = 0; i < proc_cmdline_parent.size(); i++) {
+ EXPECT_EQ(proc_cmdline_parent[i], proc_cmdline[i]);
+ }
+}
+
+// Test whether /proc/PID/ symlinks can be read for a running process.
+TEST(ProcPidSymlink, SubprocessRunning) {
+ char buf[1];
+
+ EXPECT_THAT(ReadlinkWhileRunning("exe", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadlinkWhileRunning("ns/net", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadlinkWhileRunning("ns/pid", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadlinkWhileRunning("ns/user", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+}
+
+TEST(ProcPidSymlink, SubprocessZombied) {
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ char buf[1];
+
+ int want = EACCES;
+ if (!IsRunningOnGvisor()) {
+ auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion());
+ if (version.major > 4 || (version.major == 4 && version.minor > 3)) {
+ want = ENOENT;
+ }
+ }
+
+ EXPECT_THAT(ReadlinkWhileZombied("exe", buf, sizeof(buf)),
+ SyscallFailsWithErrno(want));
+
+ if (!IsRunningOnGvisor()) {
+ EXPECT_THAT(ReadlinkWhileZombied("ns/net", buf, sizeof(buf)),
+ SyscallFailsWithErrno(want));
+ }
+
+ // FIXME(gvisor.dev/issue/164): Inconsistent behavior between linux on proc
+ // files.
+ //
+ // ~4.3: Syscall fails with EACCES.
+ // 4.17: Syscall succeeds and returns 1.
+ //
+ if (!IsRunningOnGvisor()) {
+ return;
+ }
+
+ EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)),
+ SyscallFailsWithErrno(want));
+
+ EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)),
+ SyscallFailsWithErrno(want));
+}
+
+// Test whether /proc/PID/ symlinks can be read for an exited process.
+TEST(ProcPidSymlink, SubprocessExited) {
+ char buf[1];
+
+ EXPECT_THAT(ReadlinkWhileExited("exe", buf, sizeof(buf)),
+ SyscallFailsWithErrno(ESRCH));
+
+ EXPECT_THAT(ReadlinkWhileExited("ns/net", buf, sizeof(buf)),
+ SyscallFailsWithErrno(ESRCH));
+
+ EXPECT_THAT(ReadlinkWhileExited("ns/pid", buf, sizeof(buf)),
+ SyscallFailsWithErrno(ESRCH));
+
+ EXPECT_THAT(ReadlinkWhileExited("ns/user", buf, sizeof(buf)),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+// /proc/PID/exe points to the correct binary.
+TEST(ProcPidExe, Subprocess) {
+ auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/self/exe"));
+ auto expected_absolute_path =
+ ASSERT_NO_ERRNO_AND_VALUE(MakeAbsolute(link, ""));
+
+ char actual[PATH_MAX + 1] = {};
+ ASSERT_THAT(ReadlinkWhileRunning("exe", actual, sizeof(actual)),
+ SyscallSucceedsWithValue(Gt(0)));
+ EXPECT_EQ(actual, expected_absolute_path);
+}
+
+// Test whether /proc/PID/ files can be read for a running process.
+TEST(ProcPidFile, SubprocessRunning) {
+ char buf[1];
+
+ EXPECT_THAT(ReadWhileRunning("auxv", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileRunning("cmdline", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileRunning("comm", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileRunning("gid_map", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileRunning("io", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileRunning("maps", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileRunning("stat", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileRunning("status", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileRunning("uid_map", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileRunning("oom_score", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileRunning("oom_score_adj", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+}
+
+// Test whether /proc/PID/ files can be read for a zombie process.
+TEST(ProcPidFile, SubprocessZombie) {
+ char buf[1];
+
+ // FIXME(gvisor.dev/issue/164): Loosen requirement due to inconsistent
+ // behavior on different kernels.
+ //
+ // ~4.3: Succeds and returns 0.
+ // 4.17: Succeeds and returns 1.
+ // gVisor: Succeeds and returns 0.
+ EXPECT_THAT(ReadWhileZombied("auxv", buf, sizeof(buf)), SyscallSucceeds());
+
+ EXPECT_THAT(ReadWhileZombied("cmdline", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(0));
+
+ EXPECT_THAT(ReadWhileZombied("comm", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileZombied("gid_map", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileZombied("maps", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(0));
+
+ EXPECT_THAT(ReadWhileZombied("stat", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileZombied("status", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileZombied("uid_map", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileZombied("oom_score", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(ReadWhileZombied("oom_score_adj", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux
+ // on proc files.
+ //
+ // ~4.3: Fails and returns EACCES.
+ // gVisor & 4.17: Succeeds and returns 1.
+ //
+ // EXPECT_THAT(ReadWhileZombied("io", buf, sizeof(buf)),
+ // SyscallFailsWithErrno(EACCES));
+}
+
+// Test whether /proc/PID/ files can be read for an exited process.
+TEST(ProcPidFile, SubprocessExited) {
+ char buf[1];
+
+ // FIXME(gvisor.dev/issue/164): Inconsistent behavior between kernels.
+ //
+ // ~4.3: Fails and returns ESRCH.
+ // gVisor: Fails with ESRCH.
+ // 4.17: Succeeds and returns 1.
+ //
+ // EXPECT_THAT(ReadWhileExited("auxv", buf, sizeof(buf)),
+ // SyscallFailsWithErrno(ESRCH));
+
+ EXPECT_THAT(ReadWhileExited("cmdline", buf, sizeof(buf)),
+ SyscallFailsWithErrno(ESRCH));
+
+ if (!IsRunningOnGvisor()) {
+ // FIXME(gvisor.dev/issue/164): Succeeds on gVisor.
+ EXPECT_THAT(ReadWhileExited("comm", buf, sizeof(buf)),
+ SyscallFailsWithErrno(ESRCH));
+ }
+
+ EXPECT_THAT(ReadWhileExited("gid_map", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ if (!IsRunningOnGvisor()) {
+ // FIXME(gvisor.dev/issue/164): Succeeds on gVisor.
+ EXPECT_THAT(ReadWhileExited("io", buf, sizeof(buf)),
+ SyscallFailsWithErrno(ESRCH));
+ }
+
+ if (!IsRunningOnGvisor()) {
+ // FIXME(gvisor.dev/issue/164): Returns EOF on gVisor.
+ EXPECT_THAT(ReadWhileExited("maps", buf, sizeof(buf)),
+ SyscallFailsWithErrno(ESRCH));
+ }
+
+ if (!IsRunningOnGvisor()) {
+ // FIXME(gvisor.dev/issue/164): Succeeds on gVisor.
+ EXPECT_THAT(ReadWhileExited("stat", buf, sizeof(buf)),
+ SyscallFailsWithErrno(ESRCH));
+ }
+
+ if (!IsRunningOnGvisor()) {
+ // FIXME(gvisor.dev/issue/164): Succeeds on gVisor.
+ EXPECT_THAT(ReadWhileExited("status", buf, sizeof(buf)),
+ SyscallFailsWithErrno(ESRCH));
+ }
+
+ EXPECT_THAT(ReadWhileExited("uid_map", buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ if (!IsRunningOnGvisor()) {
+ // FIXME(gvisor.dev/issue/164): Succeeds on gVisor.
+ EXPECT_THAT(ReadWhileExited("oom_score", buf, sizeof(buf)),
+ SyscallFailsWithErrno(ESRCH));
+ }
+
+ EXPECT_THAT(ReadWhileExited("oom_score_adj", buf, sizeof(buf)),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+PosixError DirContainsImpl(absl::string_view path,
+ const std::vector<std::string>& targets,
+ bool strict) {
+ ASSIGN_OR_RETURN_ERRNO(auto listing, ListDir(path, false));
+ bool success = true;
+
+ for (auto& expected_entry : targets) {
+ auto cursor = std::find(listing.begin(), listing.end(), expected_entry);
+ if (cursor == listing.end()) {
+ success = false;
+ }
+ }
+
+ if (!success) {
+ return PosixError(
+ ENOENT,
+ absl::StrCat("Failed to find one or more paths in '", path, "'"));
+ }
+
+ if (strict) {
+ if (targets.size() != listing.size()) {
+ return PosixError(
+ EINVAL,
+ absl::StrCat("Expected to find ", targets.size(), " elements in '",
+ path, "', but found ", listing.size()));
+ }
+ }
+
+ return NoError();
+}
+
+PosixError DirContains(absl::string_view path,
+ const std::vector<std::string>& targets) {
+ return DirContainsImpl(path, targets, false);
+}
+
+PosixError DirContainsExactly(absl::string_view path,
+ const std::vector<std::string>& targets) {
+ return DirContainsImpl(path, targets, true);
+}
+
+PosixError EventuallyDirContainsExactly(
+ absl::string_view path, const std::vector<std::string>& targets) {
+ constexpr int kRetryCount = 100;
+ const absl::Duration kRetryDelay = absl::Milliseconds(100);
+
+ for (int i = 0; i < kRetryCount; ++i) {
+ auto res = DirContainsExactly(path, targets);
+ if (res.ok()) {
+ return res;
+ } else if (i < kRetryCount - 1) {
+ // Sleep if this isn't the final iteration.
+ absl::SleepFor(kRetryDelay);
+ }
+ }
+ return PosixError(ETIMEDOUT,
+ "Timed out while waiting for directory to contain files ");
+}
+
+TEST(ProcTask, Basic) {
+ EXPECT_NO_ERRNO(
+ DirContains("/proc/self/task", {".", "..", absl::StrCat(getpid())}));
+}
+
+std::vector<std::string> TaskFiles(
+ const std::vector<std::string>& initial_contents,
+ const std::vector<pid_t>& pids) {
+ return VecCat<std::string>(
+ initial_contents,
+ ApplyVec<std::string>([](const pid_t p) { return absl::StrCat(p); },
+ pids));
+}
+
+std::vector<std::string> TaskFiles(const std::vector<pid_t>& pids) {
+ return TaskFiles({".", "..", absl::StrCat(getpid())}, pids);
+}
+
+// Helper class for creating a new task in the current thread group.
+class BlockingChild {
+ public:
+ BlockingChild() : thread_([=] { Start(); }) {}
+ ~BlockingChild() { Join(); }
+
+ pid_t Tid() const {
+ absl::MutexLock ml(&mu_);
+ mu_.Await(absl::Condition(&tid_ready_));
+ return tid_;
+ }
+
+ void Join() { Stop(); }
+
+ private:
+ void Start() {
+ absl::MutexLock ml(&mu_);
+ tid_ = syscall(__NR_gettid);
+ tid_ready_ = true;
+ mu_.Await(absl::Condition(&stop_));
+ }
+
+ void Stop() {
+ absl::MutexLock ml(&mu_);
+ stop_ = true;
+ }
+
+ mutable absl::Mutex mu_;
+ bool stop_ ABSL_GUARDED_BY(mu_) = false;
+ pid_t tid_;
+ bool tid_ready_ ABSL_GUARDED_BY(mu_) = false;
+
+ // Must be last to ensure that the destructor for the thread is run before
+ // any other member of the object is destroyed.
+ ScopedThread thread_;
+};
+
+TEST(ProcTask, NewThreadAppears) {
+ auto initial = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/proc/self/task", false));
+ BlockingChild child1;
+ EXPECT_NO_ERRNO(DirContainsExactly("/proc/self/task",
+ TaskFiles(initial, {child1.Tid()})));
+}
+
+TEST(ProcTask, KilledThreadsDisappear) {
+ auto initial = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/proc/self/task/", false));
+
+ BlockingChild child1;
+ EXPECT_NO_ERRNO(DirContainsExactly("/proc/self/task",
+ TaskFiles(initial, {child1.Tid()})));
+
+ // Stat child1's task file. Regression test for b/32097707.
+ struct stat statbuf;
+ const std::string child1_task_file =
+ absl::StrCat("/proc/self/task/", child1.Tid());
+ EXPECT_THAT(stat(child1_task_file.c_str(), &statbuf), SyscallSucceeds());
+
+ BlockingChild child2;
+ EXPECT_NO_ERRNO(DirContainsExactly(
+ "/proc/self/task", TaskFiles(initial, {child1.Tid(), child2.Tid()})));
+
+ BlockingChild child3;
+ BlockingChild child4;
+ BlockingChild child5;
+ EXPECT_NO_ERRNO(DirContainsExactly(
+ "/proc/self/task",
+ TaskFiles(initial, {child1.Tid(), child2.Tid(), child3.Tid(),
+ child4.Tid(), child5.Tid()})));
+
+ child2.Join();
+ EXPECT_NO_ERRNO(EventuallyDirContainsExactly(
+ "/proc/self/task", TaskFiles(initial, {child1.Tid(), child3.Tid(),
+ child4.Tid(), child5.Tid()})));
+
+ child1.Join();
+ child4.Join();
+ EXPECT_NO_ERRNO(EventuallyDirContainsExactly(
+ "/proc/self/task", TaskFiles(initial, {child3.Tid(), child5.Tid()})));
+
+ // Stat child1's task file again. This time it should fail. See b/32097707.
+ EXPECT_THAT(stat(child1_task_file.c_str(), &statbuf),
+ SyscallFailsWithErrno(ENOENT));
+
+ child3.Join();
+ child5.Join();
+ EXPECT_NO_ERRNO(EventuallyDirContainsExactly("/proc/self/task", initial));
+}
+
+TEST(ProcTask, ChildTaskDir) {
+ BlockingChild child1;
+ EXPECT_NO_ERRNO(DirContains("/proc/self/task", TaskFiles({child1.Tid()})));
+ EXPECT_NO_ERRNO(DirContains(absl::StrCat("/proc/", child1.Tid(), "/task"),
+ TaskFiles({child1.Tid()})));
+}
+
+PosixError VerifyPidDir(std::string path) {
+ return DirContains(path, {"exe", "fd", "io", "maps", "ns", "stat", "status"});
+}
+
+TEST(ProcTask, VerifyTaskDir) {
+ EXPECT_NO_ERRNO(VerifyPidDir("/proc/self"));
+
+ EXPECT_NO_ERRNO(VerifyPidDir(absl::StrCat("/proc/self/task/", getpid())));
+ BlockingChild child1;
+ EXPECT_NO_ERRNO(VerifyPidDir(absl::StrCat("/proc/self/task/", child1.Tid())));
+
+ // Only the first level of task directories should contain the 'task'
+ // directory. That is:
+ //
+ // /proc/1234/task <- should exist
+ // /proc/1234/task/1234/task <- should not exist
+ // /proc/1234/task/1235/task <- should not exist (where 1235 is in the same
+ // thread group as 1234).
+ EXPECT_FALSE(
+ DirContains(absl::StrCat("/proc/self/task/", getpid()), {"task"}).ok())
+ << "Found 'task' directory in an inner directory.";
+}
+
+TEST(ProcTask, TaskDirCannotBeDeleted) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+
+ EXPECT_THAT(rmdir("/proc/self/task"), SyscallFails());
+ EXPECT_THAT(rmdir(absl::StrCat("/proc/self/task/", getpid()).c_str()),
+ SyscallFailsWithErrno(EACCES));
+}
+
+TEST(ProcTask, TaskDirHasCorrectMetadata) {
+ struct stat st;
+ EXPECT_THAT(stat("/proc/self/task", &st), SyscallSucceeds());
+ EXPECT_TRUE(S_ISDIR(st.st_mode));
+
+ // Verify file is readable and executable by everyone.
+ mode_t expected_permissions =
+ S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH;
+ mode_t permissions = st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
+ EXPECT_EQ(expected_permissions, permissions);
+}
+
+TEST(ProcTask, TaskDirCanSeekToEnd) {
+ const FileDescriptor dirfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/task", O_RDONLY));
+ EXPECT_THAT(lseek(dirfd.get(), 0, SEEK_END), SyscallSucceeds());
+}
+
+TEST(ProcTask, VerifyTaskDirNlinks) {
+ // A task directory will have 3 links if the taskgroup has a single
+ // thread. For example, the following shows where the links to
+ // '/proc/12345/task comes' from for a single threaded process with pid 12345:
+ //
+ // /proc/12345/task <-- 1 link for the directory itself
+ // . <-- link from "."
+ // ..
+ // 12345
+ // .
+ // .. <-- link from ".." to parent.
+ // <other contents of a task dir>
+ //
+ // We can't assert an absolute number of links since we don't control how many
+ // threads the test framework spawns. Instead, we'll ensure creating a new
+ // thread increases the number of links as expected.
+
+ // Once we reach the test body, we can count on the thread count being stable
+ // unless we spawn a new one.
+ uint64_t initial_links = ASSERT_NO_ERRNO_AND_VALUE(Links("/proc/self/task"));
+ ASSERT_GE(initial_links, 3);
+
+ // For each new subtask, we should gain a new link.
+ BlockingChild child1;
+ EXPECT_THAT(Links("/proc/self/task"),
+ IsPosixErrorOkAndHolds(initial_links + 1));
+ BlockingChild child2;
+ EXPECT_THAT(Links("/proc/self/task"),
+ IsPosixErrorOkAndHolds(initial_links + 2));
+}
+
+TEST(ProcTask, CommContainsThreadNameAndTrailingNewline) {
+ constexpr char kThreadName[] = "TestThread12345";
+ ASSERT_THAT(prctl(PR_SET_NAME, kThreadName), SyscallSucceeds());
+
+ auto thread_name = ASSERT_NO_ERRNO_AND_VALUE(
+ GetContents(JoinPath("/proc", absl::StrCat(getpid()), "task",
+ absl::StrCat(syscall(SYS_gettid)), "comm")));
+ EXPECT_EQ(absl::StrCat(kThreadName, "\n"), thread_name);
+}
+
+TEST(ProcTaskNs, NsDirExistsAndHasCorrectMetadata) {
+ EXPECT_NO_ERRNO(DirContains("/proc/self/ns", {"net", "pid", "user"}));
+
+ // Let's just test the 'pid' entry, all of them are very similar.
+ struct stat st;
+ EXPECT_THAT(lstat("/proc/self/ns/pid", &st), SyscallSucceeds());
+ EXPECT_TRUE(S_ISLNK(st.st_mode));
+
+ auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/self/ns/pid"));
+ EXPECT_THAT(link, ::testing::StartsWith("pid:["));
+}
+
+TEST(ProcTaskNs, AccessOnNsNodeSucceeds) {
+ EXPECT_THAT(access("/proc/self/ns/pid", F_OK), SyscallSucceeds());
+}
+
+TEST(ProcSysKernelHostname, Exists) {
+ EXPECT_THAT(open("/proc/sys/kernel/hostname", O_RDONLY), SyscallSucceeds());
+}
+
+TEST(ProcSysKernelHostname, MatchesUname) {
+ struct utsname buf;
+ EXPECT_THAT(uname(&buf), SyscallSucceeds());
+ const std::string hostname = absl::StrCat(buf.nodename, "\n");
+ auto procfs_hostname =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/hostname"));
+ EXPECT_EQ(procfs_hostname, hostname);
+}
+
+TEST(ProcSysVmMmapMinAddr, HasNumericValue) {
+ const std::string mmap_min_addr_str =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/vm/mmap_min_addr"));
+ uintptr_t mmap_min_addr;
+ EXPECT_TRUE(absl::SimpleAtoi(mmap_min_addr_str, &mmap_min_addr))
+ << "/proc/sys/vm/mmap_min_addr does not contain a numeric value: "
+ << mmap_min_addr_str;
+}
+
+TEST(ProcSysVmOvercommitMemory, HasNumericValue) {
+ const std::string overcommit_memory_str =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/vm/overcommit_memory"));
+ uintptr_t overcommit_memory;
+ EXPECT_TRUE(absl::SimpleAtoi(overcommit_memory_str, &overcommit_memory))
+ << "/proc/sys/vm/overcommit_memory does not contain a numeric value: "
+ << overcommit_memory;
+}
+
+// Check that link for proc fd entries point the target node, not the
+// symlink itself. Regression test for b/31155070.
+TEST(ProcTaskFd, FstatatFollowsSymlink) {
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+ struct stat sproc = {};
+ EXPECT_THAT(
+ fstatat(-1, absl::StrCat("/proc/self/fd/", fd.get()).c_str(), &sproc, 0),
+ SyscallSucceeds());
+
+ struct stat sfile = {};
+ EXPECT_THAT(fstatat(-1, file.path().c_str(), &sfile, 0), SyscallSucceeds());
+
+ // If fstatat follows the fd symlink, the device and inode numbers should
+ // match at a minimum.
+ EXPECT_EQ(sproc.st_dev, sfile.st_dev);
+ EXPECT_EQ(sproc.st_ino, sfile.st_ino);
+ EXPECT_EQ(0, memcmp(&sfile, &sproc, sizeof(sfile)));
+}
+
+TEST(ProcFilesystems, Bug65172365) {
+ std::string proc_filesystems =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/filesystems"));
+ ASSERT_FALSE(proc_filesystems.empty());
+}
+
+TEST(ProcFilesystems, PresenceOfShmMaxMniAll) {
+ uint64_t shmmax = 0;
+ uint64_t shmall = 0;
+ uint64_t shmmni = 0;
+ std::string proc_file;
+ proc_file = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/shmmax"));
+ ASSERT_FALSE(proc_file.empty());
+ ASSERT_TRUE(absl::SimpleAtoi(proc_file, &shmmax));
+ proc_file = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/shmall"));
+ ASSERT_FALSE(proc_file.empty());
+ ASSERT_TRUE(absl::SimpleAtoi(proc_file, &shmall));
+ proc_file = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/sys/kernel/shmmni"));
+ ASSERT_FALSE(proc_file.empty());
+ ASSERT_TRUE(absl::SimpleAtoi(proc_file, &shmmni));
+
+ ASSERT_GT(shmmax, 0);
+ ASSERT_GT(shmall, 0);
+ ASSERT_GT(shmmni, 0);
+ ASSERT_LE(shmall, shmmax);
+
+ // These values should never be higher than this by default, for more
+ // information see uapi/linux/shm.h
+ ASSERT_LE(shmmax, ULONG_MAX - (1UL << 24));
+ ASSERT_LE(shmall, ULONG_MAX - (1UL << 24));
+}
+
+// Check that /proc/mounts is a symlink to self/mounts.
+TEST(ProcMounts, IsSymlink) {
+ auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink("/proc/mounts"));
+ EXPECT_EQ(link, "self/mounts");
+}
+
+TEST(ProcSelfMountinfo, RequiredFieldsArePresent) {
+ auto mountinfo =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mountinfo"));
+ EXPECT_THAT(
+ mountinfo,
+ AllOf(
+ // Root mount.
+ ContainsRegex(
+ R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ /\S* / (rw|ro).*- \S+ \S+ (rw|ro)\S*)"),
+ // Proc mount - always rw.
+ ContainsRegex(
+ R"([0-9]+ [0-9]+ [0-9]+:[0-9]+ / /proc rw.*- \S+ \S+ rw\S*)")));
+}
+
+// Check that /proc/self/mounts looks something like a real mounts file.
+TEST(ProcSelfMounts, RequiredFieldsArePresent) {
+ auto mounts = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/mounts"));
+ EXPECT_THAT(mounts,
+ AllOf(
+ // Root mount.
+ ContainsRegex(R"(\S+ / \S+ (rw|ro)\S* [0-9]+ [0-9]+\s)"),
+ // Root mount.
+ ContainsRegex(R"(\S+ /proc \S+ rw\S* [0-9]+ [0-9]+\s)")));
+}
+
+void CheckDuplicatesRecursively(std::string path) {
+ std::vector<std::string> child_dirs;
+
+ // There is the known issue of the linux procfs, that two consequent calls of
+ // readdir can return the same entry twice if between these calls one or more
+ // entries have been removed from this directory.
+ int max_attempts = 5;
+ for (int i = 0; i < max_attempts; i++) {
+ child_dirs.clear();
+ errno = 0;
+ bool success = true;
+ DIR* dir = opendir(path.c_str());
+ if (dir == nullptr) {
+ // Ignore any directories we can't read or missing directories as the
+ // directory could have been deleted/mutated from the time the parent
+ // directory contents were read.
+ return;
+ }
+ auto dir_closer = Cleanup([&dir]() { closedir(dir); });
+ std::unordered_set<std::string> children;
+ while (true) {
+ // Readdir(3): If the end of the directory stream is reached, NULL is
+ // returned and errno is not changed. If an error occurs, NULL is
+ // returned and errno is set appropriately. To distinguish end of stream
+ // and from an error, set errno to zero before calling readdir() and then
+ // check the value of errno if NULL is returned.
+ errno = 0;
+ struct dirent* dp = readdir(dir);
+ if (dp == nullptr) {
+ // Linux will return EINVAL when calling getdents on a /proc/tid/net
+ // file corresponding to a zombie task.
+ // See fs/proc/proc_net.c:proc_tgid_net_readdir().
+ //
+ // We just ignore the directory in this case.
+ if (errno == EINVAL && absl::StartsWith(path, "/proc/") &&
+ absl::EndsWith(path, "/net")) {
+ break;
+ }
+
+ // Otherwise, no errors are allowed.
+ ASSERT_EQ(errno, 0) << path;
+ break; // We're done.
+ }
+
+ const std::string name = dp->d_name;
+
+ if (name == "." || name == "..") {
+ continue;
+ }
+
+ // Ignore a duplicate entry if it isn't the last attempt.
+ if (i == max_attempts - 1) {
+ ASSERT_EQ(children.find(name), children.end())
+ << absl::StrCat(path, "/", name);
+ } else if (children.find(name) != children.end()) {
+ std::cerr << "Duplicate entry: " << i << ":"
+ << absl::StrCat(path, "/", name) << std::endl;
+ success = false;
+ break;
+ }
+ children.insert(name);
+
+ if (dp->d_type == DT_DIR) {
+ child_dirs.push_back(name);
+ }
+ }
+ if (success) {
+ break;
+ }
+ }
+ for (auto dname = child_dirs.begin(); dname != child_dirs.end(); dname++) {
+ CheckDuplicatesRecursively(absl::StrCat(path, "/", *dname));
+ }
+}
+
+TEST(Proc, NoDuplicates) { CheckDuplicatesRecursively("/proc"); }
+
+// Most /proc/PID files are owned by the task user with SUID_DUMP_USER.
+TEST(ProcPid, UserDumpableOwner) {
+ int before;
+ ASSERT_THAT(before = prctl(PR_GET_DUMPABLE), SyscallSucceeds());
+ auto cleanup = Cleanup([before] {
+ ASSERT_THAT(prctl(PR_SET_DUMPABLE, before), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_USER), SyscallSucceeds());
+
+ // This applies to the task directory itself and files inside.
+ struct stat st;
+ ASSERT_THAT(stat("/proc/self/", &st), SyscallSucceeds());
+ EXPECT_EQ(st.st_uid, geteuid());
+ EXPECT_EQ(st.st_gid, getegid());
+
+ ASSERT_THAT(stat("/proc/self/stat", &st), SyscallSucceeds());
+ EXPECT_EQ(st.st_uid, geteuid());
+ EXPECT_EQ(st.st_gid, getegid());
+}
+
+// /proc/PID files are owned by root with SUID_DUMP_DISABLE.
+TEST(ProcPid, RootDumpableOwner) {
+ int before;
+ ASSERT_THAT(before = prctl(PR_GET_DUMPABLE), SyscallSucceeds());
+ auto cleanup = Cleanup([before] {
+ ASSERT_THAT(prctl(PR_SET_DUMPABLE, before), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_DISABLE), SyscallSucceeds());
+
+ // This *does not* applies to the task directory itself (or other 0555
+ // directories), but does to files inside.
+ struct stat st;
+ ASSERT_THAT(stat("/proc/self/", &st), SyscallSucceeds());
+ EXPECT_EQ(st.st_uid, geteuid());
+ EXPECT_EQ(st.st_gid, getegid());
+
+ // This file is owned by root. Also allow nobody in case this test is running
+ // in a userns without root mapped.
+ ASSERT_THAT(stat("/proc/self/stat", &st), SyscallSucceeds());
+ EXPECT_THAT(st.st_uid, AnyOf(Eq(0), Eq(65534)));
+ EXPECT_THAT(st.st_gid, AnyOf(Eq(0), Eq(65534)));
+}
+
+TEST(Proc, GetdentsEnoent) {
+ FileDescriptor fd;
+ ASSERT_NO_ERRNO(WithSubprocess(
+ [&](int pid) -> PosixError {
+ // Running.
+ ASSIGN_OR_RETURN_ERRNO(fd, Open(absl::StrCat("/proc/", pid, "/task"),
+ O_RDONLY | O_DIRECTORY));
+
+ return NoError();
+ },
+ nullptr, nullptr));
+ char buf[1024];
+ ASSERT_THAT(syscall(SYS_getdents64, fd.get(), buf, sizeof(buf)),
+ SyscallFailsWithErrno(ENOENT));
+}
+
+void CheckSyscwFromIOFile(const std::string& path, const std::string& regex) {
+ std::string output;
+ ASSERT_NO_ERRNO(GetContents(path, &output));
+ ASSERT_THAT(output, ContainsRegex(absl::StrCat("syscw:\\s+", regex, "\n")));
+}
+
+// Checks that there is variable accounting of IO between threads/tasks.
+TEST(Proc, PidTidIOAccounting) {
+ absl::Notification notification;
+
+ // Run a thread with a bunch of writes. Check that io account records exactly
+ // the number of write calls. File open/close is there to prevent buffering.
+ ScopedThread writer([&notification] {
+ const int num_writes = 100;
+ for (int i = 0; i < num_writes; i++) {
+ auto path = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ ASSERT_NO_ERRNO(SetContents(path.path(), "a"));
+ }
+ notification.Notify();
+ const std::string& writer_dir =
+ absl::StrCat("/proc/", getpid(), "/task/", gettid(), "/io");
+
+ CheckSyscwFromIOFile(writer_dir, std::to_string(num_writes));
+ });
+
+ // Run a thread and do no writes. Check that no writes are recorded.
+ ScopedThread noop([&notification] {
+ notification.WaitForNotification();
+ const std::string& noop_dir =
+ absl::StrCat("/proc/", getpid(), "/task/", gettid(), "/io");
+
+ CheckSyscwFromIOFile(noop_dir, "0");
+ });
+
+ writer.Join();
+ noop.Join();
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ for (int i = 0; i < argc; ++i) {
+ gvisor::testing::saved_argv.emplace_back(std::string(argv[i]));
+ }
+
+ gvisor::testing::TestInit(&argc, &argv);
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc
new file mode 100644
index 000000000..3377b65cf
--- /dev/null
+++ b/test/syscalls/linux/proc_net.cc
@@ -0,0 +1,482 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <poll.h>
+#include <sys/socket.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr const char kProcNet[] = "/proc/net";
+
+TEST(ProcNetSymlinkTarget, FileMode) {
+ struct stat s;
+ ASSERT_THAT(stat(kProcNet, &s), SyscallSucceeds());
+ EXPECT_EQ(s.st_mode & S_IFMT, S_IFDIR);
+ EXPECT_EQ(s.st_mode & 0777, 0555);
+}
+
+TEST(ProcNetSymlink, FileMode) {
+ struct stat s;
+ ASSERT_THAT(lstat(kProcNet, &s), SyscallSucceeds());
+ EXPECT_EQ(s.st_mode & S_IFMT, S_IFLNK);
+ EXPECT_EQ(s.st_mode & 0777, 0777);
+}
+
+TEST(ProcNetSymlink, Contents) {
+ char buf[40] = {};
+ int n = readlink(kProcNet, buf, sizeof(buf));
+ ASSERT_THAT(n, SyscallSucceeds());
+
+ buf[n] = 0;
+ EXPECT_STREQ(buf, "self/net");
+}
+
+TEST(ProcNetIfInet6, Format) {
+ auto ifinet6 = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/if_inet6"));
+ EXPECT_THAT(ifinet6,
+ ::testing::MatchesRegex(
+ // Ex: "00000000000000000000000000000001 01 80 10 80 lo\n"
+ "^([a-f0-9]{32}( [a-f0-9]{2}){4} +[a-z][a-z0-9]*\n)+$"));
+}
+
+TEST(ProcSysNetIpv4Sack, Exists) {
+ EXPECT_THAT(open("/proc/sys/net/ipv4/tcp_sack", O_RDONLY), SyscallSucceeds());
+}
+
+TEST(ProcSysNetIpv4Sack, CanReadAndWrite) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_DAC_OVERRIDE))));
+
+ auto const fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/sys/net/ipv4/tcp_sack", O_RDWR));
+
+ char buf;
+ EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_TRUE(buf == '0' || buf == '1') << "unexpected tcp_sack: " << buf;
+
+ char to_write = (buf == '1') ? '0' : '1';
+ EXPECT_THAT(PwriteFd(fd.get(), &to_write, sizeof(to_write), 0),
+ SyscallSucceedsWithValue(sizeof(to_write)));
+
+ buf = 0;
+ EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ EXPECT_EQ(buf, to_write);
+}
+
+// DeviceEntry is an entry in /proc/net/dev
+struct DeviceEntry {
+ std::string name;
+ uint64_t stats[16];
+};
+
+PosixErrorOr<std::vector<DeviceEntry>> GetDeviceMetricsFromProc(
+ const std::string dev) {
+ std::vector<std::string> lines = absl::StrSplit(dev, '\n');
+ std::vector<DeviceEntry> entries;
+
+ // /proc/net/dev prints 2 lines of headers followed by a line of metrics for
+ // each network interface.
+ for (unsigned i = 2; i < lines.size(); i++) {
+ // Ignore empty lines.
+ if (lines[i].empty()) {
+ continue;
+ }
+
+ std::vector<std::string> values =
+ absl::StrSplit(lines[i], ' ', absl::SkipWhitespace());
+
+ // Interface name + 16 values.
+ if (values.size() != 17) {
+ return PosixError(EINVAL, "invalid line: " + lines[i]);
+ }
+
+ DeviceEntry entry;
+ entry.name = values[0];
+ // Skip the interface name and read only the values.
+ for (unsigned j = 1; j < 17; j++) {
+ uint64_t num;
+ if (!absl::SimpleAtoi(values[j], &num)) {
+ return PosixError(EINVAL, "invalid value: " + values[j]);
+ }
+ entry.stats[j - 1] = num;
+ }
+
+ entries.push_back(entry);
+ }
+
+ return entries;
+}
+
+// TEST(ProcNetDev, Format) tests that /proc/net/dev is parsable and
+// contains at least one entry.
+TEST(ProcNetDev, Format) {
+ auto dev = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/dev"));
+ auto entries = ASSERT_NO_ERRNO_AND_VALUE(GetDeviceMetricsFromProc(dev));
+
+ EXPECT_GT(entries.size(), 0);
+}
+
+PosixErrorOr<uint64_t> GetSNMPMetricFromProc(const std::string snmp,
+ const std::string& type,
+ const std::string& item) {
+ std::vector<std::string> snmp_vec = absl::StrSplit(snmp, '\n');
+
+ // /proc/net/snmp prints a line of headers followed by a line of metrics.
+ // Only search the headers.
+ for (unsigned i = 0; i < snmp_vec.size(); i = i + 2) {
+ if (!absl::StartsWith(snmp_vec[i], type)) continue;
+
+ std::vector<std::string> fields =
+ absl::StrSplit(snmp_vec[i], ' ', absl::SkipWhitespace());
+
+ EXPECT_TRUE((i + 1) < snmp_vec.size());
+ std::vector<std::string> values =
+ absl::StrSplit(snmp_vec[i + 1], ' ', absl::SkipWhitespace());
+
+ EXPECT_TRUE(!fields.empty() && fields.size() == values.size());
+
+ // Metrics start at the first index.
+ for (unsigned j = 1; j < fields.size(); j++) {
+ if (fields[j] == item) {
+ uint64_t val;
+ if (!absl::SimpleAtoi(values[j], &val)) {
+ return PosixError(EINVAL,
+ absl::StrCat("field is not a number: ", values[j]));
+ }
+
+ return val;
+ }
+ }
+ }
+ // We should never get here.
+ return PosixError(
+ EINVAL, absl::StrCat("failed to find ", type, "/", item, " in:", snmp));
+}
+
+TEST(ProcNetSnmp, TcpReset_NoRandomSave) {
+ // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
+ DisableSave ds;
+
+ uint64_t oldAttemptFails;
+ uint64_t oldActiveOpens;
+ uint64_t oldOutRsts;
+ auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+ oldActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+ oldOutRsts =
+ ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "OutRsts"));
+ oldAttemptFails = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "AttemptFails"));
+
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
+
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_port = htons(1234),
+ };
+
+ ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
+ ASSERT_THAT(connect(s.get(), (struct sockaddr*)&sin, sizeof(sin)),
+ SyscallFailsWithErrno(ECONNREFUSED));
+
+ uint64_t newAttemptFails;
+ uint64_t newActiveOpens;
+ uint64_t newOutRsts;
+ snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+ newActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+ newOutRsts =
+ ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Tcp", "OutRsts"));
+ newAttemptFails = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "AttemptFails"));
+
+ EXPECT_EQ(oldActiveOpens, newActiveOpens - 1);
+ EXPECT_EQ(oldOutRsts, newOutRsts - 1);
+ EXPECT_EQ(oldAttemptFails, newAttemptFails - 1);
+}
+
+TEST(ProcNetSnmp, TcpEstab_NoRandomSave) {
+ // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
+ DisableSave ds;
+
+ uint64_t oldEstabResets;
+ uint64_t oldActiveOpens;
+ uint64_t oldPassiveOpens;
+ uint64_t oldCurrEstab;
+ auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+ oldActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+ oldPassiveOpens = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "PassiveOpens"));
+ oldCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
+ oldEstabResets = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "EstabResets"));
+
+ FileDescriptor s_listen =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_port = 0,
+ };
+
+ ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
+ ASSERT_THAT(bind(s_listen.get(), (struct sockaddr*)&sin, sizeof(sin)),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(s_listen.get(), 1), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = sizeof(sin);
+ ASSERT_THAT(
+ getsockname(s_listen.get(), reinterpret_cast<sockaddr*>(&sin), &addrlen),
+ SyscallSucceeds());
+
+ FileDescriptor s_connect =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_STREAM, 0));
+ ASSERT_THAT(connect(s_connect.get(), (struct sockaddr*)&sin, sizeof(sin)),
+ SyscallSucceeds());
+
+ auto s_accept =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(s_listen.get(), nullptr, nullptr));
+
+ uint64_t newEstabResets;
+ uint64_t newActiveOpens;
+ uint64_t newPassiveOpens;
+ uint64_t newCurrEstab;
+ snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+ newActiveOpens = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "ActiveOpens"));
+ newPassiveOpens = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "PassiveOpens"));
+ newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
+
+ EXPECT_EQ(oldActiveOpens, newActiveOpens - 1);
+ EXPECT_EQ(oldPassiveOpens, newPassiveOpens - 1);
+ EXPECT_EQ(oldCurrEstab, newCurrEstab - 2);
+
+ // Send 1 byte from client to server.
+ ASSERT_THAT(send(s_connect.get(), "a", 1, 0), SyscallSucceedsWithValue(1));
+
+ constexpr int kPollTimeoutMs = 20000; // Wait up to 20 seconds for the data.
+
+ // Wait until server-side fd sees the data on its side but don't read it.
+ struct pollfd poll_fd = {s_accept.get(), POLLIN, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ // Now close server-side fd without reading the data which leads to a RST
+ // packet sent to client side.
+ s_accept.reset(-1);
+
+ // Wait until client-side fd sees RST packet.
+ struct pollfd poll_fd1 = {s_connect.get(), POLLIN, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd1, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ // Now close client-side fd.
+ s_connect.reset(-1);
+
+ // Wait until the process of the netstack.
+ absl::SleepFor(absl::Seconds(1));
+
+ snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+ newCurrEstab = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "CurrEstab"));
+ newEstabResets = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Tcp", "EstabResets"));
+
+ EXPECT_EQ(oldCurrEstab, newCurrEstab);
+ EXPECT_EQ(oldEstabResets, newEstabResets - 2);
+}
+
+TEST(ProcNetSnmp, UdpNoPorts_NoRandomSave) {
+ // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
+ DisableSave ds;
+
+ uint64_t oldOutDatagrams;
+ uint64_t oldNoPorts;
+ auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+ oldOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+ oldNoPorts =
+ ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "NoPorts"));
+
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_port = htons(4444),
+ };
+ ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
+ ASSERT_THAT(sendto(s.get(), "a", 1, 0, (struct sockaddr*)&sin, sizeof(sin)),
+ SyscallSucceedsWithValue(1));
+
+ uint64_t newOutDatagrams;
+ uint64_t newNoPorts;
+ snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+ newOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+ newNoPorts =
+ ASSERT_NO_ERRNO_AND_VALUE(GetSNMPMetricFromProc(snmp, "Udp", "NoPorts"));
+
+ EXPECT_EQ(oldOutDatagrams, newOutDatagrams - 1);
+ EXPECT_EQ(oldNoPorts, newNoPorts - 1);
+}
+
+TEST(ProcNetSnmp, UdpIn_NoRandomSave) {
+ // TODO(gvisor.dev/issue/866): epsocket metrics are not savable.
+ const DisableSave ds;
+
+ uint64_t oldOutDatagrams;
+ uint64_t oldInDatagrams;
+ auto snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+ oldOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+ oldInDatagrams = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Udp", "InDatagrams"));
+
+ std::cerr << "snmp: " << std::endl << snmp << std::endl;
+ FileDescriptor server =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_port = htons(0),
+ };
+ ASSERT_EQ(inet_pton(AF_INET, "127.0.0.1", &(sin.sin_addr)), 1);
+ ASSERT_THAT(bind(server.get(), (struct sockaddr*)&sin, sizeof(sin)),
+ SyscallSucceeds());
+ // Get the port bound by the server socket.
+ socklen_t addrlen = sizeof(sin);
+ ASSERT_THAT(
+ getsockname(server.get(), reinterpret_cast<sockaddr*>(&sin), &addrlen),
+ SyscallSucceeds());
+
+ FileDescriptor client =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+ ASSERT_THAT(
+ sendto(client.get(), "a", 1, 0, (struct sockaddr*)&sin, sizeof(sin)),
+ SyscallSucceedsWithValue(1));
+
+ char buf[128];
+ ASSERT_THAT(recvfrom(server.get(), buf, sizeof(buf), 0, NULL, NULL),
+ SyscallSucceedsWithValue(1));
+
+ uint64_t newOutDatagrams;
+ uint64_t newInDatagrams;
+ snmp = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+ std::cerr << "new snmp: " << std::endl << snmp << std::endl;
+ newOutDatagrams = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Udp", "OutDatagrams"));
+ newInDatagrams = ASSERT_NO_ERRNO_AND_VALUE(
+ GetSNMPMetricFromProc(snmp, "Udp", "InDatagrams"));
+
+ EXPECT_EQ(oldOutDatagrams, newOutDatagrams - 1);
+ EXPECT_EQ(oldInDatagrams, newInDatagrams - 1);
+}
+
+TEST(ProcNetSnmp, CheckNetStat) {
+ // TODO(b/155123175): SNMP and netstat don't work on gVisor.
+ SKIP_IF(IsRunningOnGvisor());
+
+ std::string contents =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/netstat"));
+
+ int name_count = 0;
+ int value_count = 0;
+ std::vector<absl::string_view> lines = absl::StrSplit(contents, '\n');
+ for (int i = 0; i + 1 < lines.size(); i += 2) {
+ std::vector<absl::string_view> names =
+ absl::StrSplit(lines[i], absl::ByAnyChar("\t "));
+ std::vector<absl::string_view> values =
+ absl::StrSplit(lines[i + 1], absl::ByAnyChar("\t "));
+ EXPECT_EQ(names.size(), values.size()) << " mismatch in lines '" << lines[i]
+ << "' and '" << lines[i + 1] << "'";
+ for (int j = 0; j < names.size() && j < values.size(); ++j) {
+ if (names[j] == "TCPOrigDataSent" || names[j] == "TCPSynRetrans" ||
+ names[j] == "TCPDSACKRecv" || names[j] == "TCPDSACKOfoRecv") {
+ ++name_count;
+ int64_t val;
+ if (absl::SimpleAtoi(values[j], &val)) {
+ ++value_count;
+ }
+ }
+ }
+ }
+ EXPECT_EQ(name_count, 4);
+ EXPECT_EQ(value_count, 4);
+}
+
+TEST(ProcNetSnmp, Stat) {
+ struct stat st = {};
+ ASSERT_THAT(stat("/proc/net/snmp", &st), SyscallSucceeds());
+}
+
+TEST(ProcNetSnmp, CheckSnmp) {
+ // TODO(b/155123175): SNMP and netstat don't work on gVisor.
+ SKIP_IF(IsRunningOnGvisor());
+
+ std::string contents =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/snmp"));
+
+ int name_count = 0;
+ int value_count = 0;
+ std::vector<absl::string_view> lines = absl::StrSplit(contents, '\n');
+ for (int i = 0; i + 1 < lines.size(); i += 2) {
+ std::vector<absl::string_view> names =
+ absl::StrSplit(lines[i], absl::ByAnyChar("\t "));
+ std::vector<absl::string_view> values =
+ absl::StrSplit(lines[i + 1], absl::ByAnyChar("\t "));
+ EXPECT_EQ(names.size(), values.size()) << " mismatch in lines '" << lines[i]
+ << "' and '" << lines[i + 1] << "'";
+ for (int j = 0; j < names.size() && j < values.size(); ++j) {
+ if (names[j] == "RetransSegs") {
+ ++name_count;
+ int64_t val;
+ if (absl::SimpleAtoi(values[j], &val)) {
+ ++value_count;
+ }
+ }
+ }
+ }
+ EXPECT_EQ(name_count, 1);
+ EXPECT_EQ(value_count, 1);
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/proc_net_tcp.cc b/test/syscalls/linux/proc_net_tcp.cc
new file mode 100644
index 000000000..5b6e3e3cd
--- /dev/null
+++ b/test/syscalls/linux/proc_net_tcp.cc
@@ -0,0 +1,496 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/tcp.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_split.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+using absl::StrCat;
+using absl::StrSplit;
+
+constexpr char kProcNetTCPHeader[] =
+ " sl local_address rem_address st tx_queue rx_queue tr tm->when "
+ "retrnsmt uid timeout inode "
+ " ";
+
+// TCPEntry represents a single entry from /proc/net/tcp.
+struct TCPEntry {
+ uint32_t local_addr;
+ uint16_t local_port;
+
+ uint32_t remote_addr;
+ uint16_t remote_port;
+
+ uint64_t state;
+ uint64_t uid;
+ uint64_t inode;
+};
+
+// Finds the first entry in 'entries' for which 'predicate' returns true.
+// Returns true on match, and sets 'match' to a copy of the matching entry. If
+// 'match' is null, it's ignored.
+bool FindBy(const std::vector<TCPEntry>& entries, TCPEntry* match,
+ std::function<bool(const TCPEntry&)> predicate) {
+ for (const TCPEntry& entry : entries) {
+ if (predicate(entry)) {
+ if (match != nullptr) {
+ *match = entry;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+bool FindByLocalAddr(const std::vector<TCPEntry>& entries, TCPEntry* match,
+ const struct sockaddr* addr) {
+ uint32_t host = IPFromInetSockaddr(addr);
+ uint16_t port = PortFromInetSockaddr(addr);
+ return FindBy(entries, match, [host, port](const TCPEntry& e) {
+ return (e.local_addr == host && e.local_port == port);
+ });
+}
+
+bool FindByRemoteAddr(const std::vector<TCPEntry>& entries, TCPEntry* match,
+ const struct sockaddr* addr) {
+ uint32_t host = IPFromInetSockaddr(addr);
+ uint16_t port = PortFromInetSockaddr(addr);
+ return FindBy(entries, match, [host, port](const TCPEntry& e) {
+ return (e.remote_addr == host && e.remote_port == port);
+ });
+}
+
+// Returns a parsed representation of /proc/net/tcp entries.
+PosixErrorOr<std::vector<TCPEntry>> ProcNetTCPEntries() {
+ std::string content;
+ RETURN_IF_ERRNO(GetContents("/proc/net/tcp", &content));
+
+ bool found_header = false;
+ std::vector<TCPEntry> entries;
+ std::vector<std::string> lines = StrSplit(content, '\n');
+ std::cerr << "<contents of /proc/net/tcp>" << std::endl;
+ for (const std::string& line : lines) {
+ std::cerr << line << std::endl;
+
+ if (!found_header) {
+ EXPECT_EQ(line, kProcNetTCPHeader);
+ found_header = true;
+ continue;
+ }
+ if (line.empty()) {
+ continue;
+ }
+
+ // Parse a single entry from /proc/net/tcp.
+ //
+ // Example entries:
+ //
+ // clang-format off
+ //
+ // sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
+ // 0: 00000000:006F 00000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 1968 1 0000000000000000 100 0 0 10 0
+ // 1: 0100007F:7533 00000000:0000 0A 00000000:00000000 00:00000000 00000000 120 0 10684 1 0000000000000000 100 0 0 10 0
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+ //
+ // clang-format on
+
+ TCPEntry entry;
+ std::vector<std::string> fields =
+ StrSplit(line, absl::ByAnyChar(": "), absl::SkipEmpty());
+
+ ASSIGN_OR_RETURN_ERRNO(entry.local_addr, AtoiBase(fields[1], 16));
+ ASSIGN_OR_RETURN_ERRNO(entry.local_port, AtoiBase(fields[2], 16));
+
+ ASSIGN_OR_RETURN_ERRNO(entry.remote_addr, AtoiBase(fields[3], 16));
+ ASSIGN_OR_RETURN_ERRNO(entry.remote_port, AtoiBase(fields[4], 16));
+
+ ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16));
+ ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64_t>(fields[11]));
+ ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64_t>(fields[13]));
+
+ entries.push_back(entry);
+ }
+ std::cerr << "<end of /proc/net/tcp>" << std::endl;
+
+ return entries;
+}
+
+TEST(ProcNetTCP, Exists) {
+ const std::string content =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/tcp"));
+ const std::string header_line = StrCat(kProcNetTCPHeader, "\n");
+ if (IsRunningOnGvisor()) {
+ // Should be just the header since we don't have any tcp sockets yet.
+ EXPECT_EQ(content, header_line);
+ } else {
+ // On a general linux machine, we could have abitrary sockets on the system,
+ // so just check the header.
+ EXPECT_THAT(content, ::testing::StartsWith(header_line));
+ }
+}
+
+TEST(ProcNetTCP, EntryUID) {
+ auto sockets =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv4TCPAcceptBindSocketPair(0).Create());
+ std::vector<TCPEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries());
+ TCPEntry e;
+ ASSERT_TRUE(FindByLocalAddr(entries, &e, sockets->first_addr()));
+ EXPECT_EQ(e.uid, geteuid());
+ ASSERT_TRUE(FindByRemoteAddr(entries, &e, sockets->first_addr()));
+ EXPECT_EQ(e.uid, geteuid());
+}
+
+TEST(ProcNetTCP, BindAcceptConnect) {
+ auto sockets =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv4TCPAcceptBindSocketPair(0).Create());
+ std::vector<TCPEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries());
+ // We can only make assertions about the total number of entries if we control
+ // the entire "machine".
+ if (IsRunningOnGvisor()) {
+ EXPECT_EQ(entries.size(), 2);
+ }
+
+ EXPECT_TRUE(FindByLocalAddr(entries, nullptr, sockets->first_addr()));
+ EXPECT_TRUE(FindByRemoteAddr(entries, nullptr, sockets->first_addr()));
+}
+
+TEST(ProcNetTCP, InodeReasonable) {
+ auto sockets =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv4TCPAcceptBindSocketPair(0).Create());
+ std::vector<TCPEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries());
+
+ TCPEntry accepted_entry;
+ ASSERT_TRUE(FindByLocalAddr(entries, &accepted_entry, sockets->first_addr()));
+ EXPECT_NE(accepted_entry.inode, 0);
+
+ TCPEntry client_entry;
+ ASSERT_TRUE(FindByRemoteAddr(entries, &client_entry, sockets->first_addr()));
+ EXPECT_NE(client_entry.inode, 0);
+ EXPECT_NE(accepted_entry.inode, client_entry.inode);
+}
+
+TEST(ProcNetTCP, State) {
+ std::unique_ptr<FileDescriptor> server =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv4TCPUnboundSocket(0).Create());
+
+ auto test_addr = V4Loopback();
+ ASSERT_THAT(
+ bind(server->get(), reinterpret_cast<struct sockaddr*>(&test_addr.addr),
+ test_addr.addr_len),
+ SyscallSucceeds());
+
+ struct sockaddr addr;
+ socklen_t addrlen = sizeof(struct sockaddr);
+ ASSERT_THAT(getsockname(server->get(), &addr, &addrlen), SyscallSucceeds());
+ ASSERT_EQ(addrlen, sizeof(struct sockaddr));
+
+ ASSERT_THAT(listen(server->get(), 10), SyscallSucceeds());
+ std::vector<TCPEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries());
+ TCPEntry listen_entry;
+ ASSERT_TRUE(FindByLocalAddr(entries, &listen_entry, &addr));
+ EXPECT_EQ(listen_entry.state, TCP_LISTEN);
+
+ std::unique_ptr<FileDescriptor> client =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv4TCPUnboundSocket(0).Create());
+ ASSERT_THAT(RetryEINTR(connect)(client->get(), &addr, addrlen),
+ SyscallSucceeds());
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries());
+ ASSERT_TRUE(FindByLocalAddr(entries, &listen_entry, &addr));
+ EXPECT_EQ(listen_entry.state, TCP_LISTEN);
+ TCPEntry client_entry;
+ ASSERT_TRUE(FindByRemoteAddr(entries, &client_entry, &addr));
+ EXPECT_EQ(client_entry.state, TCP_ESTABLISHED);
+
+ FileDescriptor accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(server->get(), nullptr, nullptr));
+
+ const uint32_t accepted_local_host = IPFromInetSockaddr(&addr);
+ const uint16_t accepted_local_port = PortFromInetSockaddr(&addr);
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCPEntries());
+ TCPEntry accepted_entry;
+ ASSERT_TRUE(FindBy(entries, &accepted_entry,
+ [client_entry, accepted_local_host,
+ accepted_local_port](const TCPEntry& e) {
+ return e.local_addr == accepted_local_host &&
+ e.local_port == accepted_local_port &&
+ e.remote_addr == client_entry.local_addr &&
+ e.remote_port == client_entry.local_port;
+ }));
+ EXPECT_EQ(accepted_entry.state, TCP_ESTABLISHED);
+}
+
+constexpr char kProcNetTCP6Header[] =
+ " sl local_address remote_address"
+ " st tx_queue rx_queue tr tm->when retrnsmt"
+ " uid timeout inode";
+
+// TCP6Entry represents a single entry from /proc/net/tcp6.
+struct TCP6Entry {
+ struct in6_addr local_addr;
+ uint16_t local_port;
+
+ struct in6_addr remote_addr;
+ uint16_t remote_port;
+
+ uint64_t state;
+ uint64_t uid;
+ uint64_t inode;
+};
+
+bool IPv6AddrEqual(const struct in6_addr* a1, const struct in6_addr* a2) {
+ return memcmp(a1, a2, sizeof(struct in6_addr)) == 0;
+}
+
+// Finds the first entry in 'entries' for which 'predicate' returns true.
+// Returns true on match, and sets 'match' to a copy of the matching entry. If
+// 'match' is null, it's ignored.
+bool FindBy6(const std::vector<TCP6Entry>& entries, TCP6Entry* match,
+ std::function<bool(const TCP6Entry&)> predicate) {
+ for (const TCP6Entry& entry : entries) {
+ if (predicate(entry)) {
+ if (match != nullptr) {
+ *match = entry;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+const struct in6_addr* IP6FromInetSockaddr(const struct sockaddr* addr) {
+ auto* addr6 = reinterpret_cast<const struct sockaddr_in6*>(addr);
+ return &addr6->sin6_addr;
+}
+
+bool FindByLocalAddr6(const std::vector<TCP6Entry>& entries, TCP6Entry* match,
+ const struct sockaddr* addr) {
+ const struct in6_addr* local = IP6FromInetSockaddr(addr);
+ uint16_t port = PortFromInetSockaddr(addr);
+ return FindBy6(entries, match, [local, port](const TCP6Entry& e) {
+ return (IPv6AddrEqual(&e.local_addr, local) && e.local_port == port);
+ });
+}
+
+bool FindByRemoteAddr6(const std::vector<TCP6Entry>& entries, TCP6Entry* match,
+ const struct sockaddr* addr) {
+ const struct in6_addr* remote = IP6FromInetSockaddr(addr);
+ uint16_t port = PortFromInetSockaddr(addr);
+ return FindBy6(entries, match, [remote, port](const TCP6Entry& e) {
+ return (IPv6AddrEqual(&e.remote_addr, remote) && e.remote_port == port);
+ });
+}
+
+void ReadIPv6Address(std::string s, struct in6_addr* addr) {
+ uint32_t a0, a1, a2, a3;
+ const char* fmt = "%08X%08X%08X%08X";
+ EXPECT_EQ(sscanf(s.c_str(), fmt, &a0, &a1, &a2, &a3), 4);
+
+ uint8_t* b = addr->s6_addr;
+ *((uint32_t*)&b[0]) = a0;
+ *((uint32_t*)&b[4]) = a1;
+ *((uint32_t*)&b[8]) = a2;
+ *((uint32_t*)&b[12]) = a3;
+}
+
+// Returns a parsed representation of /proc/net/tcp6 entries.
+PosixErrorOr<std::vector<TCP6Entry>> ProcNetTCP6Entries() {
+ std::string content;
+ RETURN_IF_ERRNO(GetContents("/proc/net/tcp6", &content));
+
+ bool found_header = false;
+ std::vector<TCP6Entry> entries;
+ std::vector<std::string> lines = StrSplit(content, '\n');
+ std::cerr << "<contents of /proc/net/tcp6>" << std::endl;
+ for (const std::string& line : lines) {
+ std::cerr << line << std::endl;
+
+ if (!found_header) {
+ EXPECT_EQ(line, kProcNetTCP6Header);
+ found_header = true;
+ continue;
+ }
+ if (line.empty()) {
+ continue;
+ }
+
+ // Parse a single entry from /proc/net/tcp6.
+ //
+ // Example entries:
+ //
+ // clang-format off
+ //
+ // sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
+ // 0: 00000000000000000000000000000000:1F90 00000000000000000000000000000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 876340 1 ffff8803da9c9380 100 0 0 10 0
+ // 1: 00000000000000000000000000000000:C350 00000000000000000000000000000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 876987 1 ffff8803ec408000 100 0 0 10 0
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+ //
+ // clang-format on
+
+ TCP6Entry entry;
+ std::vector<std::string> fields =
+ StrSplit(line, absl::ByAnyChar(": "), absl::SkipEmpty());
+
+ ReadIPv6Address(fields[1], &entry.local_addr);
+ ASSIGN_OR_RETURN_ERRNO(entry.local_port, AtoiBase(fields[2], 16));
+ ReadIPv6Address(fields[3], &entry.remote_addr);
+ ASSIGN_OR_RETURN_ERRNO(entry.remote_port, AtoiBase(fields[4], 16));
+ ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16));
+ ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64_t>(fields[11]));
+ ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64_t>(fields[13]));
+
+ entries.push_back(entry);
+ }
+ std::cerr << "<end of /proc/net/tcp6>" << std::endl;
+
+ return entries;
+}
+
+TEST(ProcNetTCP6, Exists) {
+ const std::string content =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/tcp6"));
+ const std::string header_line = StrCat(kProcNetTCP6Header, "\n");
+ if (IsRunningOnGvisor()) {
+ // Should be just the header since we don't have any tcp sockets yet.
+ EXPECT_EQ(content, header_line);
+ } else {
+ // On a general linux machine, we could have abitrary sockets on the system,
+ // so just check the header.
+ EXPECT_THAT(content, ::testing::StartsWith(header_line));
+ }
+}
+
+TEST(ProcNetTCP6, EntryUID) {
+ auto sockets =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv6TCPAcceptBindSocketPair(0).Create());
+ std::vector<TCP6Entry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCP6Entries());
+ TCP6Entry e;
+
+ ASSERT_TRUE(FindByLocalAddr6(entries, &e, sockets->first_addr()));
+ EXPECT_EQ(e.uid, geteuid());
+ ASSERT_TRUE(FindByRemoteAddr6(entries, &e, sockets->first_addr()));
+ EXPECT_EQ(e.uid, geteuid());
+}
+
+TEST(ProcNetTCP6, BindAcceptConnect) {
+ auto sockets =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv6TCPAcceptBindSocketPair(0).Create());
+ std::vector<TCP6Entry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCP6Entries());
+ // We can only make assertions about the total number of entries if we control
+ // the entire "machine".
+ if (IsRunningOnGvisor()) {
+ EXPECT_EQ(entries.size(), 2);
+ }
+
+ EXPECT_TRUE(FindByLocalAddr6(entries, nullptr, sockets->first_addr()));
+ EXPECT_TRUE(FindByRemoteAddr6(entries, nullptr, sockets->first_addr()));
+}
+
+TEST(ProcNetTCP6, InodeReasonable) {
+ auto sockets =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv6TCPAcceptBindSocketPair(0).Create());
+ std::vector<TCP6Entry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCP6Entries());
+
+ TCP6Entry accepted_entry;
+
+ ASSERT_TRUE(
+ FindByLocalAddr6(entries, &accepted_entry, sockets->first_addr()));
+ EXPECT_NE(accepted_entry.inode, 0);
+
+ TCP6Entry client_entry;
+ ASSERT_TRUE(FindByRemoteAddr6(entries, &client_entry, sockets->first_addr()));
+ EXPECT_NE(client_entry.inode, 0);
+ EXPECT_NE(accepted_entry.inode, client_entry.inode);
+}
+
+TEST(ProcNetTCP6, State) {
+ std::unique_ptr<FileDescriptor> server =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv6TCPUnboundSocket(0).Create());
+
+ auto test_addr = V6Loopback();
+ ASSERT_THAT(
+ bind(server->get(), reinterpret_cast<struct sockaddr*>(&test_addr.addr),
+ test_addr.addr_len),
+ SyscallSucceeds());
+
+ struct sockaddr_in6 addr6;
+ socklen_t addrlen = sizeof(struct sockaddr_in6);
+ auto* addr = reinterpret_cast<struct sockaddr*>(&addr6);
+ ASSERT_THAT(getsockname(server->get(), addr, &addrlen), SyscallSucceeds());
+ ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6));
+
+ ASSERT_THAT(listen(server->get(), 10), SyscallSucceeds());
+ std::vector<TCP6Entry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCP6Entries());
+ TCP6Entry listen_entry;
+
+ ASSERT_TRUE(FindByLocalAddr6(entries, &listen_entry, addr));
+ EXPECT_EQ(listen_entry.state, TCP_LISTEN);
+
+ std::unique_ptr<FileDescriptor> client =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv6TCPUnboundSocket(0).Create());
+ ASSERT_THAT(RetryEINTR(connect)(client->get(), addr, addrlen),
+ SyscallSucceeds());
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCP6Entries());
+ ASSERT_TRUE(FindByLocalAddr6(entries, &listen_entry, addr));
+ EXPECT_EQ(listen_entry.state, TCP_LISTEN);
+ TCP6Entry client_entry;
+ ASSERT_TRUE(FindByRemoteAddr6(entries, &client_entry, addr));
+ EXPECT_EQ(client_entry.state, TCP_ESTABLISHED);
+
+ FileDescriptor accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(server->get(), nullptr, nullptr));
+
+ const struct in6_addr* local = IP6FromInetSockaddr(addr);
+ const uint16_t accepted_local_port = PortFromInetSockaddr(addr);
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetTCP6Entries());
+ TCP6Entry accepted_entry;
+ ASSERT_TRUE(FindBy6(
+ entries, &accepted_entry,
+ [client_entry, local, accepted_local_port](const TCP6Entry& e) {
+ return IPv6AddrEqual(&e.local_addr, local) &&
+ e.local_port == accepted_local_port &&
+ IPv6AddrEqual(&e.remote_addr, &client_entry.local_addr) &&
+ e.remote_port == client_entry.local_port;
+ }));
+ EXPECT_EQ(accepted_entry.state, TCP_ESTABLISHED);
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/proc_net_udp.cc b/test/syscalls/linux/proc_net_udp.cc
new file mode 100644
index 000000000..786b4b4af
--- /dev/null
+++ b/test/syscalls/linux/proc_net_udp.cc
@@ -0,0 +1,309 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/tcp.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_split.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+using absl::StrCat;
+using absl::StrFormat;
+using absl::StrSplit;
+
+constexpr char kProcNetUDPHeader[] =
+ " sl local_address rem_address st tx_queue rx_queue tr tm->when "
+ "retrnsmt uid timeout inode ref pointer drops ";
+
+// UDPEntry represents a single entry from /proc/net/udp.
+struct UDPEntry {
+ uint32_t local_addr;
+ uint16_t local_port;
+
+ uint32_t remote_addr;
+ uint16_t remote_port;
+
+ uint64_t state;
+ uint64_t uid;
+ uint64_t inode;
+};
+
+std::string DescribeFirstInetSocket(const SocketPair& sockets) {
+ const struct sockaddr* addr = sockets.first_addr();
+ return StrFormat("First test socket: fd:%d %8X:%4X", sockets.first_fd(),
+ IPFromInetSockaddr(addr), PortFromInetSockaddr(addr));
+}
+
+std::string DescribeSecondInetSocket(const SocketPair& sockets) {
+ const struct sockaddr* addr = sockets.second_addr();
+ return StrFormat("Second test socket fd:%d %8X:%4X", sockets.second_fd(),
+ IPFromInetSockaddr(addr), PortFromInetSockaddr(addr));
+}
+
+// Finds the first entry in 'entries' for which 'predicate' returns true.
+// Returns true on match, and set 'match' to a copy of the matching entry. If
+// 'match' is null, it's ignored.
+bool FindBy(const std::vector<UDPEntry>& entries, UDPEntry* match,
+ std::function<bool(const UDPEntry&)> predicate) {
+ for (const UDPEntry& entry : entries) {
+ if (predicate(entry)) {
+ if (match != nullptr) {
+ *match = entry;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+bool FindByLocalAddr(const std::vector<UDPEntry>& entries, UDPEntry* match,
+ const struct sockaddr* addr) {
+ uint32_t host = IPFromInetSockaddr(addr);
+ uint16_t port = PortFromInetSockaddr(addr);
+ return FindBy(entries, match, [host, port](const UDPEntry& e) {
+ return (e.local_addr == host && e.local_port == port);
+ });
+}
+
+bool FindByRemoteAddr(const std::vector<UDPEntry>& entries, UDPEntry* match,
+ const struct sockaddr* addr) {
+ uint32_t host = IPFromInetSockaddr(addr);
+ uint16_t port = PortFromInetSockaddr(addr);
+ return FindBy(entries, match, [host, port](const UDPEntry& e) {
+ return (e.remote_addr == host && e.remote_port == port);
+ });
+}
+
+PosixErrorOr<uint64_t> InodeFromSocketFD(int fd) {
+ ASSIGN_OR_RETURN_ERRNO(struct stat s, Fstat(fd));
+ if (!S_ISSOCK(s.st_mode)) {
+ return PosixError(EINVAL, StrFormat("FD %d is not a socket", fd));
+ }
+ return s.st_ino;
+}
+
+PosixErrorOr<bool> FindByFD(const std::vector<UDPEntry>& entries,
+ UDPEntry* match, int fd) {
+ ASSIGN_OR_RETURN_ERRNO(uint64_t inode, InodeFromSocketFD(fd));
+ return FindBy(entries, match,
+ [inode](const UDPEntry& e) { return (e.inode == inode); });
+}
+
+// Returns a parsed representation of /proc/net/udp entries.
+PosixErrorOr<std::vector<UDPEntry>> ProcNetUDPEntries() {
+ std::string content;
+ RETURN_IF_ERRNO(GetContents("/proc/net/udp", &content));
+
+ bool found_header = false;
+ std::vector<UDPEntry> entries;
+ std::vector<std::string> lines = StrSplit(content, '\n');
+ std::cerr << "<contents of /proc/net/udp>" << std::endl;
+ for (const std::string& line : lines) {
+ std::cerr << line << std::endl;
+
+ if (!found_header) {
+ EXPECT_EQ(line, kProcNetUDPHeader);
+ found_header = true;
+ continue;
+ }
+ if (line.empty()) {
+ continue;
+ }
+
+ // Parse a single entry from /proc/net/udp.
+ //
+ // Example entries:
+ //
+ // clang-format off
+ //
+ // sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops
+ // 3503: 0100007F:0035 00000000:0000 07 00000000:00000000 00:00000000 00000000 0 0 33317 2 0000000000000000 0
+ // 3518: 00000000:0044 00000000:0000 07 00000000:00000000 00:00000000 00000000 0 0 40394 2 0000000000000000 0
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
+ //
+ // clang-format on
+
+ UDPEntry entry;
+ std::vector<std::string> fields =
+ StrSplit(line, absl::ByAnyChar(": "), absl::SkipEmpty());
+
+ ASSIGN_OR_RETURN_ERRNO(entry.local_addr, AtoiBase(fields[1], 16));
+ ASSIGN_OR_RETURN_ERRNO(entry.local_port, AtoiBase(fields[2], 16));
+
+ ASSIGN_OR_RETURN_ERRNO(entry.remote_addr, AtoiBase(fields[3], 16));
+ ASSIGN_OR_RETURN_ERRNO(entry.remote_port, AtoiBase(fields[4], 16));
+
+ ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16));
+ ASSIGN_OR_RETURN_ERRNO(entry.uid, Atoi<uint64_t>(fields[11]));
+ ASSIGN_OR_RETURN_ERRNO(entry.inode, Atoi<uint64_t>(fields[13]));
+
+ // Linux shares internal data structures between TCP and UDP sockets. The
+ // proc entries for UDP sockets share some fields with TCP sockets, but
+ // these fields should always be zero as they're not meaningful for UDP
+ // sockets.
+ EXPECT_EQ(fields[8], "00") << StrFormat("sl:%s, tr", fields[0]);
+ EXPECT_EQ(fields[9], "00000000") << StrFormat("sl:%s, tm->when", fields[0]);
+ EXPECT_EQ(fields[10], "00000000")
+ << StrFormat("sl:%s, retrnsmt", fields[0]);
+ EXPECT_EQ(fields[12], "0") << StrFormat("sl:%s, timeout", fields[0]);
+
+ entries.push_back(entry);
+ }
+ std::cerr << "<end of /proc/net/udp>" << std::endl;
+
+ return entries;
+}
+
+TEST(ProcNetUDP, Exists) {
+ const std::string content =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/udp"));
+ const std::string header_line = StrCat(kProcNetUDPHeader, "\n");
+ EXPECT_THAT(content, ::testing::StartsWith(header_line));
+}
+
+TEST(ProcNetUDP, EntryUID) {
+ auto sockets =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv4UDPBidirectionalBindSocketPair(0).Create());
+ std::vector<UDPEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+ UDPEntry e;
+ ASSERT_TRUE(FindByLocalAddr(entries, &e, sockets->first_addr()))
+ << DescribeFirstInetSocket(*sockets);
+ EXPECT_EQ(e.uid, geteuid());
+ ASSERT_TRUE(FindByRemoteAddr(entries, &e, sockets->first_addr()))
+ << DescribeSecondInetSocket(*sockets);
+ EXPECT_EQ(e.uid, geteuid());
+}
+
+TEST(ProcNetUDP, FindMutualEntries) {
+ auto sockets =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv4UDPBidirectionalBindSocketPair(0).Create());
+ std::vector<UDPEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+
+ EXPECT_TRUE(FindByLocalAddr(entries, nullptr, sockets->first_addr()))
+ << DescribeFirstInetSocket(*sockets);
+ EXPECT_TRUE(FindByRemoteAddr(entries, nullptr, sockets->first_addr()))
+ << DescribeSecondInetSocket(*sockets);
+
+ EXPECT_TRUE(FindByLocalAddr(entries, nullptr, sockets->second_addr()))
+ << DescribeSecondInetSocket(*sockets);
+ EXPECT_TRUE(FindByRemoteAddr(entries, nullptr, sockets->second_addr()))
+ << DescribeFirstInetSocket(*sockets);
+}
+
+TEST(ProcNetUDP, EntriesRemovedOnClose) {
+ auto sockets =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv4UDPBidirectionalBindSocketPair(0).Create());
+ std::vector<UDPEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+
+ EXPECT_TRUE(FindByLocalAddr(entries, nullptr, sockets->first_addr()))
+ << DescribeFirstInetSocket(*sockets);
+ EXPECT_TRUE(FindByLocalAddr(entries, nullptr, sockets->second_addr()))
+ << DescribeSecondInetSocket(*sockets);
+
+ EXPECT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+ // First socket's entry should be gone, but the second socket's entry should
+ // still exist.
+ EXPECT_FALSE(FindByLocalAddr(entries, nullptr, sockets->first_addr()))
+ << DescribeFirstInetSocket(*sockets);
+ EXPECT_TRUE(FindByLocalAddr(entries, nullptr, sockets->second_addr()))
+ << DescribeSecondInetSocket(*sockets);
+
+ EXPECT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+ // Both entries should be gone.
+ EXPECT_FALSE(FindByLocalAddr(entries, nullptr, sockets->first_addr()))
+ << DescribeFirstInetSocket(*sockets);
+ EXPECT_FALSE(FindByLocalAddr(entries, nullptr, sockets->second_addr()))
+ << DescribeSecondInetSocket(*sockets);
+}
+
+PosixErrorOr<std::unique_ptr<FileDescriptor>> BoundUDPSocket() {
+ ASSIGN_OR_RETURN_ERRNO(std::unique_ptr<FileDescriptor> socket,
+ IPv4UDPUnboundSocket(0).Create());
+ struct sockaddr_in addr;
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(INADDR_ANY);
+ addr.sin_port = 0;
+
+ int res = bind(socket->get(), reinterpret_cast<const struct sockaddr*>(&addr),
+ sizeof(addr));
+ if (res) {
+ return PosixError(errno, "bind()");
+ }
+ return socket;
+}
+
+TEST(ProcNetUDP, BoundEntry) {
+ std::unique_ptr<FileDescriptor> socket =
+ ASSERT_NO_ERRNO_AND_VALUE(BoundUDPSocket());
+ struct sockaddr addr;
+ socklen_t len = sizeof(addr);
+ ASSERT_THAT(getsockname(socket->get(), &addr, &len), SyscallSucceeds());
+ uint16_t port = PortFromInetSockaddr(&addr);
+
+ std::vector<UDPEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+ UDPEntry e;
+ ASSERT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(FindByFD(entries, &e, socket->get())));
+ EXPECT_EQ(e.local_port, port);
+ EXPECT_EQ(e.remote_addr, 0);
+ EXPECT_EQ(e.remote_port, 0);
+}
+
+TEST(ProcNetUDP, BoundSocketStateClosed) {
+ std::unique_ptr<FileDescriptor> socket =
+ ASSERT_NO_ERRNO_AND_VALUE(BoundUDPSocket());
+ std::vector<UDPEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+ UDPEntry e;
+ ASSERT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(FindByFD(entries, &e, socket->get())));
+ EXPECT_EQ(e.state, TCP_CLOSE);
+}
+
+TEST(ProcNetUDP, ConnectedSocketStateEstablished) {
+ auto sockets =
+ ASSERT_NO_ERRNO_AND_VALUE(IPv4UDPBidirectionalBindSocketPair(0).Create());
+ std::vector<UDPEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUDPEntries());
+
+ UDPEntry e;
+ ASSERT_TRUE(FindByLocalAddr(entries, &e, sockets->first_addr()))
+ << DescribeFirstInetSocket(*sockets);
+ EXPECT_EQ(e.state, TCP_ESTABLISHED);
+
+ ASSERT_TRUE(FindByLocalAddr(entries, &e, sockets->second_addr()))
+ << DescribeSecondInetSocket(*sockets);
+ EXPECT_EQ(e.state, TCP_ESTABLISHED);
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc
new file mode 100644
index 000000000..a63067586
--- /dev/null
+++ b/test/syscalls/linux/proc_net_unix.cc
@@ -0,0 +1,443 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_split.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+using absl::StrCat;
+using absl::StreamFormat;
+using absl::StrFormat;
+
+constexpr char kProcNetUnixHeader[] =
+ "Num RefCount Protocol Flags Type St Inode Path";
+
+// Possible values of the "st" field in a /proc/net/unix entry. Source: Linux
+// kernel, include/uapi/linux/net.h.
+enum {
+ SS_FREE = 0, // Not allocated
+ SS_UNCONNECTED, // Unconnected to any socket
+ SS_CONNECTING, // In process of connecting
+ SS_CONNECTED, // Connected to socket
+ SS_DISCONNECTING // In process of disconnecting
+};
+
+// UnixEntry represents a single entry from /proc/net/unix.
+struct UnixEntry {
+ uintptr_t addr;
+ uint64_t refs;
+ uint64_t protocol;
+ uint64_t flags;
+ uint64_t type;
+ uint64_t state;
+ uint64_t inode;
+ std::string path;
+};
+
+// Abstract socket paths can have either trailing null bytes or '@'s as padding
+// at the end, depending on the linux version. This function strips any such
+// padding.
+void StripAbstractPathPadding(std::string* s) {
+ const char pad_char = s->back();
+ if (pad_char != '\0' && pad_char != '@') {
+ return;
+ }
+
+ const auto last_pos = s->find_last_not_of(pad_char);
+ if (last_pos != std::string::npos) {
+ s->resize(last_pos + 1);
+ }
+}
+
+// Precondition: addr must be a unix socket address (i.e. sockaddr_un) and
+// addr->sun_path must be null-terminated. This is always the case if addr comes
+// from Linux:
+//
+// Per man unix(7):
+//
+// "When the address of a pathname socket is returned (by [getsockname(2)]), its
+// length is
+//
+// offsetof(struct sockaddr_un, sun_path) + strlen(sun_path) + 1
+//
+// and sun_path contains the null-terminated pathname."
+std::string ExtractPath(const struct sockaddr* addr) {
+ const char* path =
+ reinterpret_cast<const struct sockaddr_un*>(addr)->sun_path;
+ // Note: sockaddr_un.sun_path is an embedded character array of length
+ // UNIX_PATH_MAX, so we can always safely dereference the first 2 bytes below.
+ //
+ // We also rely on the path being null-terminated.
+ if (path[0] == 0) {
+ std::string abstract_path = StrCat("@", &path[1]);
+ StripAbstractPathPadding(&abstract_path);
+ return abstract_path;
+ }
+ return std::string(path);
+}
+
+// Returns a parsed representation of /proc/net/unix entries.
+PosixErrorOr<std::vector<UnixEntry>> ProcNetUnixEntries() {
+ std::string content;
+ RETURN_IF_ERRNO(GetContents("/proc/net/unix", &content));
+
+ bool skipped_header = false;
+ std::vector<UnixEntry> entries;
+ std::vector<std::string> lines = absl::StrSplit(content, '\n');
+ std::cerr << "<contents of /proc/net/unix>" << std::endl;
+ for (const std::string& line : lines) {
+ // Emit the proc entry to the test output to provide context for the test
+ // results.
+ std::cerr << line << std::endl;
+
+ if (!skipped_header) {
+ EXPECT_EQ(line, kProcNetUnixHeader);
+ skipped_header = true;
+ continue;
+ }
+ if (line.empty()) {
+ continue;
+ }
+
+ // Parse a single entry from /proc/net/unix.
+ //
+ // Sample file:
+ //
+ // clang-format off
+ //
+ // Num RefCount Protocol Flags Type St Inode Path"
+ // ffffa130e7041c00: 00000002 00000000 00010000 0001 01 1299413685 /tmp/control_server/13293772586877554487
+ // ffffa14f547dc400: 00000002 00000000 00010000 0001 01 3793 @remote_coredump
+ //
+ // clang-format on
+ //
+ // Note that from the second entry, the inode number can be padded using
+ // spaces, so we need to handle it separately during parsing. See
+ // net/unix/af_unix.c:unix_seq_show() for how these entries are produced. In
+ // particular, only the inode field is padded with spaces.
+ UnixEntry entry;
+
+ // Process the first 6 fields, up to but not including "Inode".
+ std::vector<std::string> fields =
+ absl::StrSplit(line, absl::MaxSplits(' ', 6));
+
+ if (fields.size() < 7) {
+ return PosixError(EINVAL, StrFormat("Invalid entry: '%s'\n", line));
+ }
+
+ // AtoiBase can't handle the ':' in the "Num" field, so strip it out.
+ std::vector<std::string> addr = absl::StrSplit(fields[0], ':');
+ ASSIGN_OR_RETURN_ERRNO(entry.addr, AtoiBase(addr[0], 16));
+
+ ASSIGN_OR_RETURN_ERRNO(entry.refs, AtoiBase(fields[1], 16));
+ ASSIGN_OR_RETURN_ERRNO(entry.protocol, AtoiBase(fields[2], 16));
+ ASSIGN_OR_RETURN_ERRNO(entry.flags, AtoiBase(fields[3], 16));
+ ASSIGN_OR_RETURN_ERRNO(entry.type, AtoiBase(fields[4], 16));
+ ASSIGN_OR_RETURN_ERRNO(entry.state, AtoiBase(fields[5], 16));
+
+ absl::string_view rest = absl::StripAsciiWhitespace(fields[6]);
+ fields = absl::StrSplit(rest, absl::MaxSplits(' ', 1));
+ if (fields.empty()) {
+ return PosixError(
+ EINVAL, StrFormat("Invalid entry, missing 'Inode': '%s'\n", line));
+ }
+ ASSIGN_OR_RETURN_ERRNO(entry.inode, AtoiBase(fields[0], 10));
+
+ entry.path = "";
+ if (fields.size() > 1) {
+ entry.path = fields[1];
+ StripAbstractPathPadding(&entry.path);
+ }
+
+ entries.push_back(entry);
+ }
+ std::cerr << "<end of /proc/net/unix>" << std::endl;
+
+ return entries;
+}
+
+// Finds the first entry in 'entries' for which 'predicate' returns true.
+// Returns true on match, and sets 'match' to point to the matching entry.
+bool FindBy(std::vector<UnixEntry> entries, UnixEntry* match,
+ std::function<bool(const UnixEntry&)> predicate) {
+ for (int i = 0; i < entries.size(); ++i) {
+ if (predicate(entries[i])) {
+ *match = entries[i];
+ return true;
+ }
+ }
+ return false;
+}
+
+bool FindByPath(std::vector<UnixEntry> entries, UnixEntry* match,
+ const std::string& path) {
+ return FindBy(entries, match,
+ [path](const UnixEntry& e) { return e.path == path; });
+}
+
+TEST(ProcNetUnix, Exists) {
+ const std::string content =
+ ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/net/unix"));
+ const std::string header_line = StrCat(kProcNetUnixHeader, "\n");
+ if (IsRunningOnGvisor()) {
+ // Should be just the header since we don't have any unix domain sockets
+ // yet.
+ EXPECT_EQ(content, header_line);
+ } else {
+ // However, on a general linux machine, we could have abitrary sockets on
+ // the system, so just check the header.
+ EXPECT_THAT(content, ::testing::StartsWith(header_line));
+ }
+}
+
+TEST(ProcNetUnix, FilesystemBindAcceptConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ FilesystemBoundUnixDomainSocketPair(SOCK_STREAM).Create());
+
+ std::string path1 = ExtractPath(sockets->first_addr());
+ std::string path2 = ExtractPath(sockets->second_addr());
+ std::cerr << StreamFormat("Server socket address (path1): %s\n", path1);
+ std::cerr << StreamFormat("Client socket address (path2): %s\n", path2);
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ if (IsRunningOnGvisor()) {
+ EXPECT_EQ(entries.size(), 2);
+ }
+
+ // The server-side socket's path is listed in the socket entry...
+ UnixEntry s1;
+ EXPECT_TRUE(FindByPath(entries, &s1, path1));
+
+ // ... but the client-side socket's path is not.
+ UnixEntry s2;
+ EXPECT_FALSE(FindByPath(entries, &s2, path2));
+}
+
+TEST(ProcNetUnix, AbstractBindAcceptConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractBoundUnixDomainSocketPair(SOCK_STREAM).Create());
+
+ std::string path1 = ExtractPath(sockets->first_addr());
+ std::string path2 = ExtractPath(sockets->second_addr());
+ std::cerr << StreamFormat("Server socket address (path1): '%s'\n", path1);
+ std::cerr << StreamFormat("Client socket address (path2): '%s'\n", path2);
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ if (IsRunningOnGvisor()) {
+ EXPECT_EQ(entries.size(), 2);
+ }
+
+ // The server-side socket's path is listed in the socket entry...
+ UnixEntry s1;
+ EXPECT_TRUE(FindByPath(entries, &s1, path1));
+
+ // ... but the client-side socket's path is not.
+ UnixEntry s2;
+ EXPECT_FALSE(FindByPath(entries, &s2, path2));
+}
+
+TEST(ProcNetUnix, SocketPair) {
+ // Under gvisor, ensure a socketpair() syscall creates exactly 2 new
+ // entries. We have no way to verify this under Linux, as we have no control
+ // over socket creation on a general Linux machine.
+ SKIP_IF(!IsRunningOnGvisor());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ ASSERT_EQ(entries.size(), 0);
+
+ auto sockets =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_STREAM).Create());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ EXPECT_EQ(entries.size(), 2);
+}
+
+TEST(ProcNetUnix, StreamSocketStateUnconnectedOnBind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_STREAM).Create());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ const std::string address = ExtractPath(sockets->first_addr());
+ UnixEntry bind_entry;
+ ASSERT_TRUE(FindByPath(entries, &bind_entry, address));
+ EXPECT_EQ(bind_entry.state, SS_UNCONNECTED);
+}
+
+TEST(ProcNetUnix, StreamSocketStateStateUnconnectedOnListen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_STREAM).Create());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ const std::string address = ExtractPath(sockets->first_addr());
+ UnixEntry bind_entry;
+ ASSERT_TRUE(FindByPath(entries, &bind_entry, address));
+ EXPECT_EQ(bind_entry.state, SS_UNCONNECTED);
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ UnixEntry listen_entry;
+ ASSERT_TRUE(
+ FindByPath(entries, &listen_entry, ExtractPath(sockets->first_addr())));
+ EXPECT_EQ(listen_entry.state, SS_UNCONNECTED);
+ // The bind and listen entries should refer to the same socket.
+ EXPECT_EQ(listen_entry.inode, bind_entry.inode);
+}
+
+TEST(ProcNetUnix, StreamSocketStateStateConnectedOnAccept) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_STREAM).Create());
+ const std::string address = ExtractPath(sockets->first_addr());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ UnixEntry listen_entry;
+ ASSERT_TRUE(
+ FindByPath(entries, &listen_entry, ExtractPath(sockets->first_addr())));
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ int clientfd;
+ ASSERT_THAT(clientfd = accept(sockets->first_fd(), nullptr, nullptr),
+ SyscallSucceeds());
+
+ // Find the entry for the accepted socket. UDS proc entries don't have a
+ // remote address, so we distinguish the accepted socket from the listen
+ // socket by checking for a different inode.
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ UnixEntry accept_entry;
+ ASSERT_TRUE(FindBy(
+ entries, &accept_entry, [address, listen_entry](const UnixEntry& e) {
+ return e.path == address && e.inode != listen_entry.inode;
+ }));
+ EXPECT_EQ(accept_entry.state, SS_CONNECTED);
+ // Listen entry should still be in SS_UNCONNECTED state.
+ ASSERT_TRUE(FindBy(entries, &listen_entry,
+ [&sockets, listen_entry](const UnixEntry& e) {
+ return e.path == ExtractPath(sockets->first_addr()) &&
+ e.inode == listen_entry.inode;
+ }));
+ EXPECT_EQ(listen_entry.state, SS_UNCONNECTED);
+}
+
+TEST(ProcNetUnix, DgramSocketStateDisconnectingOnBind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_DGRAM).Create());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ // On gVisor, the only two UDS on the system are the ones we just created and
+ // we rely on this to locate the test socket entries in the remainder of the
+ // test. On a generic Linux system, we have no easy way to locate the
+ // corresponding entries, as they don't have an address yet.
+ if (IsRunningOnGvisor()) {
+ ASSERT_EQ(entries.size(), 2);
+ for (const auto& e : entries) {
+ ASSERT_EQ(e.state, SS_DISCONNECTING);
+ }
+ }
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ const std::string address = ExtractPath(sockets->first_addr());
+ UnixEntry bind_entry;
+ ASSERT_TRUE(FindByPath(entries, &bind_entry, address));
+ EXPECT_EQ(bind_entry.state, SS_UNCONNECTED);
+}
+
+TEST(ProcNetUnix, DgramSocketStateConnectingOnConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_DGRAM).Create());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ // On gVisor, the only two UDS on the system are the ones we just created and
+ // we rely on this to locate the test socket entries in the remainder of the
+ // test. On a generic Linux system, we have no easy way to locate the
+ // corresponding entries, as they don't have an address yet.
+ if (IsRunningOnGvisor()) {
+ ASSERT_EQ(entries.size(), 2);
+ for (const auto& e : entries) {
+ ASSERT_EQ(e.state, SS_DISCONNECTING);
+ }
+ }
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ const std::string address = ExtractPath(sockets->first_addr());
+ UnixEntry bind_entry;
+ ASSERT_TRUE(FindByPath(entries, &bind_entry, address));
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ // Once again, we have no easy way to identify the connecting socket as it has
+ // no listed address. We can only identify the entry as the "non-bind socket
+ // entry" on gVisor, where we're guaranteed to have only the two entries we
+ // create during this test.
+ if (IsRunningOnGvisor()) {
+ ASSERT_EQ(entries.size(), 2);
+ UnixEntry connect_entry;
+ ASSERT_TRUE(
+ FindBy(entries, &connect_entry, [bind_entry](const UnixEntry& e) {
+ return e.inode != bind_entry.inode;
+ }));
+ EXPECT_EQ(connect_entry.state, SS_CONNECTING);
+ }
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/proc_pid_oomscore.cc b/test/syscalls/linux/proc_pid_oomscore.cc
new file mode 100644
index 000000000..707821a3f
--- /dev/null
+++ b/test/syscalls/linux/proc_pid_oomscore.cc
@@ -0,0 +1,72 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+
+#include <exception>
+#include <iostream>
+#include <string>
+
+#include "test/util/fs_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+PosixErrorOr<int> ReadProcNumber(std::string path) {
+ ASSIGN_OR_RETURN_ERRNO(std::string contents, GetContents(path));
+ EXPECT_EQ(contents[contents.length() - 1], '\n');
+
+ int num;
+ if (!absl::SimpleAtoi(contents, &num)) {
+ return PosixError(EINVAL, "invalid value: " + contents);
+ }
+
+ return num;
+}
+
+TEST(ProcPidOomscoreTest, BasicRead) {
+ auto const oom_score =
+ ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score"));
+ EXPECT_LE(oom_score, 1000);
+ EXPECT_GE(oom_score, -1000);
+}
+
+TEST(ProcPidOomscoreAdjTest, BasicRead) {
+ auto const oom_score =
+ ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj"));
+
+ // oom_score_adj defaults to 0.
+ EXPECT_EQ(oom_score, 0);
+}
+
+TEST(ProcPidOomscoreAdjTest, BasicWrite) {
+ constexpr int test_value = 7;
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/oom_score_adj", O_WRONLY));
+ ASSERT_THAT(
+ RetryEINTR(write)(fd.get(), std::to_string(test_value).c_str(), 1),
+ SyscallSucceeds());
+
+ auto const oom_score =
+ ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj"));
+ EXPECT_EQ(oom_score, test_value);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/proc_pid_smaps.cc b/test/syscalls/linux/proc_pid_smaps.cc
new file mode 100644
index 000000000..9fb1b3a2c
--- /dev/null
+++ b/test/syscalls/linux/proc_pid_smaps.cc
@@ -0,0 +1,468 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/memory_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/proc_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+using ::testing::Contains;
+using ::testing::ElementsAreArray;
+using ::testing::IsSupersetOf;
+using ::testing::Not;
+using ::testing::Optional;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+struct ProcPidSmapsEntry {
+ ProcMapsEntry maps_entry;
+
+ // These fields should always exist, as they were included in e070ad49f311
+ // "[PATCH] add /proc/pid/smaps".
+ size_t size_kb;
+ size_t rss_kb;
+ size_t shared_clean_kb;
+ size_t shared_dirty_kb;
+ size_t private_clean_kb;
+ size_t private_dirty_kb;
+
+ // These fields were added later and may not be present.
+ absl::optional<size_t> pss_kb;
+ absl::optional<size_t> referenced_kb;
+ absl::optional<size_t> anonymous_kb;
+ absl::optional<size_t> anon_huge_pages_kb;
+ absl::optional<size_t> shared_hugetlb_kb;
+ absl::optional<size_t> private_hugetlb_kb;
+ absl::optional<size_t> swap_kb;
+ absl::optional<size_t> swap_pss_kb;
+ absl::optional<size_t> kernel_page_size_kb;
+ absl::optional<size_t> mmu_page_size_kb;
+ absl::optional<size_t> locked_kb;
+
+ // Caution: "Note that there is no guarantee that every flag and associated
+ // mnemonic will be present in all further kernel releases. Things get
+ // changed, the flags may be vanished or the reverse -- new added." - Linux
+ // Documentation/filesystems/proc.txt, on VmFlags. Avoid checking for any
+ // flags that are not extremely well-established.
+ absl::optional<std::vector<std::string>> vm_flags;
+};
+
+// Given the value part of a /proc/[pid]/smaps field containing a value in kB
+// (for example, " 4 kB", returns the value in kB (in this example, 4).
+PosixErrorOr<size_t> SmapsValueKb(absl::string_view value) {
+ // TODO(jamieliu): let us use RE2 or <regex>
+ std::pair<absl::string_view, absl::string_view> parts =
+ absl::StrSplit(value, ' ', absl::SkipEmpty());
+ if (parts.second != "kB") {
+ return PosixError(EINVAL,
+ absl::StrCat("invalid smaps field value: ", value));
+ }
+ ASSIGN_OR_RETURN_ERRNO(auto val_kb, Atoi<size_t>(parts.first));
+ return val_kb;
+}
+
+PosixErrorOr<std::vector<ProcPidSmapsEntry>> ParseProcPidSmaps(
+ absl::string_view contents) {
+ std::vector<ProcPidSmapsEntry> entries;
+ absl::optional<ProcPidSmapsEntry> entry;
+ bool have_size_kb = false;
+ bool have_rss_kb = false;
+ bool have_shared_clean_kb = false;
+ bool have_shared_dirty_kb = false;
+ bool have_private_clean_kb = false;
+ bool have_private_dirty_kb = false;
+
+ auto const finish_entry = [&] {
+ if (entry) {
+ if (!have_size_kb) {
+ return PosixError(EINVAL, "smaps entry is missing Size");
+ }
+ if (!have_rss_kb) {
+ return PosixError(EINVAL, "smaps entry is missing Rss");
+ }
+ if (!have_shared_clean_kb) {
+ return PosixError(EINVAL, "smaps entry is missing Shared_Clean");
+ }
+ if (!have_shared_dirty_kb) {
+ return PosixError(EINVAL, "smaps entry is missing Shared_Dirty");
+ }
+ if (!have_private_clean_kb) {
+ return PosixError(EINVAL, "smaps entry is missing Private_Clean");
+ }
+ if (!have_private_dirty_kb) {
+ return PosixError(EINVAL, "smaps entry is missing Private_Dirty");
+ }
+ // std::move(entry.value()) instead of std::move(entry).value(), because
+ // otherwise tools may report a "use-after-move" warning, which is
+ // spurious because entry.emplace() below resets entry to a new
+ // ProcPidSmapsEntry.
+ entries.emplace_back(std::move(entry.value()));
+ }
+ entry.emplace();
+ have_size_kb = false;
+ have_rss_kb = false;
+ have_shared_clean_kb = false;
+ have_shared_dirty_kb = false;
+ have_private_clean_kb = false;
+ have_private_dirty_kb = false;
+ return NoError();
+ };
+
+ // Holds key/value pairs from smaps field lines. Declared here so it can be
+ // captured by reference by the following lambdas.
+ std::vector<absl::string_view> key_value;
+
+ auto const on_required_field_kb = [&](size_t* field, bool* have_field) {
+ if (*have_field) {
+ return PosixError(
+ EINVAL,
+ absl::StrFormat("smaps entry has duplicate %s line", key_value[0]));
+ }
+ ASSIGN_OR_RETURN_ERRNO(*field, SmapsValueKb(key_value[1]));
+ *have_field = true;
+ return NoError();
+ };
+
+ auto const on_optional_field_kb = [&](absl::optional<size_t>* field) {
+ if (*field) {
+ return PosixError(
+ EINVAL,
+ absl::StrFormat("smaps entry has duplicate %s line", key_value[0]));
+ }
+ ASSIGN_OR_RETURN_ERRNO(*field, SmapsValueKb(key_value[1]));
+ return NoError();
+ };
+
+ absl::flat_hash_set<std::string> unknown_fields;
+ auto const on_unknown_field = [&] {
+ absl::string_view key = key_value[0];
+ // Don't mention unknown fields more than once.
+ if (unknown_fields.count(key)) {
+ return;
+ }
+ unknown_fields.insert(std::string(key));
+ std::cerr << "skipping unknown smaps field " << key << std::endl;
+ };
+
+ auto lines = absl::StrSplit(contents, '\n', absl::SkipEmpty());
+ for (absl::string_view l : lines) {
+ // Is this line a valid /proc/[pid]/maps entry?
+ auto maybe_maps_entry = ParseProcMapsLine(l);
+ if (maybe_maps_entry.ok()) {
+ // This marks the beginning of a new /proc/[pid]/smaps entry.
+ RETURN_IF_ERRNO(finish_entry());
+ entry->maps_entry = std::move(maybe_maps_entry).ValueOrDie();
+ continue;
+ }
+ // Otherwise it's a field in an existing /proc/[pid]/smaps entry of the form
+ // "key:value" (where value in practice will be preceded by a variable
+ // amount of whitespace).
+ if (!entry) {
+ std::cerr << "smaps line not considered a maps line: "
+ << maybe_maps_entry.error_message() << std::endl;
+ return PosixError(
+ EINVAL,
+ absl::StrCat("smaps field line without preceding maps line: ", l));
+ }
+ key_value = absl::StrSplit(l, absl::MaxSplits(':', 1));
+ if (key_value.size() != 2) {
+ return PosixError(EINVAL, absl::StrCat("invalid smaps field line: ", l));
+ }
+ absl::string_view const key = key_value[0];
+ if (key == "Size") {
+ RETURN_IF_ERRNO(on_required_field_kb(&entry->size_kb, &have_size_kb));
+ } else if (key == "Rss") {
+ RETURN_IF_ERRNO(on_required_field_kb(&entry->rss_kb, &have_rss_kb));
+ } else if (key == "Shared_Clean") {
+ RETURN_IF_ERRNO(
+ on_required_field_kb(&entry->shared_clean_kb, &have_shared_clean_kb));
+ } else if (key == "Shared_Dirty") {
+ RETURN_IF_ERRNO(
+ on_required_field_kb(&entry->shared_dirty_kb, &have_shared_dirty_kb));
+ } else if (key == "Private_Clean") {
+ RETURN_IF_ERRNO(on_required_field_kb(&entry->private_clean_kb,
+ &have_private_clean_kb));
+ } else if (key == "Private_Dirty") {
+ RETURN_IF_ERRNO(on_required_field_kb(&entry->private_dirty_kb,
+ &have_private_dirty_kb));
+ } else if (key == "Pss") {
+ RETURN_IF_ERRNO(on_optional_field_kb(&entry->pss_kb));
+ } else if (key == "Referenced") {
+ RETURN_IF_ERRNO(on_optional_field_kb(&entry->referenced_kb));
+ } else if (key == "Anonymous") {
+ RETURN_IF_ERRNO(on_optional_field_kb(&entry->anonymous_kb));
+ } else if (key == "AnonHugePages") {
+ RETURN_IF_ERRNO(on_optional_field_kb(&entry->anon_huge_pages_kb));
+ } else if (key == "Shared_Hugetlb") {
+ RETURN_IF_ERRNO(on_optional_field_kb(&entry->shared_hugetlb_kb));
+ } else if (key == "Private_Hugetlb") {
+ RETURN_IF_ERRNO(on_optional_field_kb(&entry->private_hugetlb_kb));
+ } else if (key == "Swap") {
+ RETURN_IF_ERRNO(on_optional_field_kb(&entry->swap_kb));
+ } else if (key == "SwapPss") {
+ RETURN_IF_ERRNO(on_optional_field_kb(&entry->swap_pss_kb));
+ } else if (key == "KernelPageSize") {
+ RETURN_IF_ERRNO(on_optional_field_kb(&entry->kernel_page_size_kb));
+ } else if (key == "MMUPageSize") {
+ RETURN_IF_ERRNO(on_optional_field_kb(&entry->mmu_page_size_kb));
+ } else if (key == "Locked") {
+ RETURN_IF_ERRNO(on_optional_field_kb(&entry->locked_kb));
+ } else if (key == "VmFlags") {
+ if (entry->vm_flags) {
+ return PosixError(EINVAL, "duplicate VmFlags line");
+ }
+ entry->vm_flags = absl::StrSplit(key_value[1], ' ', absl::SkipEmpty());
+ } else {
+ on_unknown_field();
+ }
+ }
+ RETURN_IF_ERRNO(finish_entry());
+ return entries;
+};
+
+TEST(ParseProcPidSmapsTest, Correctness) {
+ auto entries = ASSERT_NO_ERRNO_AND_VALUE(
+ ParseProcPidSmaps("0-10000 rw-s 00000000 00:00 0 "
+ " /dev/zero (deleted)\n"
+ "Size: 0 kB\n"
+ "Rss: 1 kB\n"
+ "Pss: 2 kB\n"
+ "Shared_Clean: 3 kB\n"
+ "Shared_Dirty: 4 kB\n"
+ "Private_Clean: 5 kB\n"
+ "Private_Dirty: 6 kB\n"
+ "Referenced: 7 kB\n"
+ "Anonymous: 8 kB\n"
+ "AnonHugePages: 9 kB\n"
+ "Shared_Hugetlb: 10 kB\n"
+ "Private_Hugetlb: 11 kB\n"
+ "Swap: 12 kB\n"
+ "SwapPss: 13 kB\n"
+ "KernelPageSize: 14 kB\n"
+ "MMUPageSize: 15 kB\n"
+ "Locked: 16 kB\n"
+ "FutureUnknownKey: 17 kB\n"
+ "VmFlags: rd wr sh mr mw me ms lo ?? sd \n"));
+ ASSERT_EQ(entries.size(), 1);
+ auto& entry = entries[0];
+ EXPECT_EQ(entry.maps_entry.filename, "/dev/zero (deleted)");
+ EXPECT_EQ(entry.size_kb, 0);
+ EXPECT_EQ(entry.rss_kb, 1);
+ EXPECT_THAT(entry.pss_kb, Optional(2));
+ EXPECT_EQ(entry.shared_clean_kb, 3);
+ EXPECT_EQ(entry.shared_dirty_kb, 4);
+ EXPECT_EQ(entry.private_clean_kb, 5);
+ EXPECT_EQ(entry.private_dirty_kb, 6);
+ EXPECT_THAT(entry.referenced_kb, Optional(7));
+ EXPECT_THAT(entry.anonymous_kb, Optional(8));
+ EXPECT_THAT(entry.anon_huge_pages_kb, Optional(9));
+ EXPECT_THAT(entry.shared_hugetlb_kb, Optional(10));
+ EXPECT_THAT(entry.private_hugetlb_kb, Optional(11));
+ EXPECT_THAT(entry.swap_kb, Optional(12));
+ EXPECT_THAT(entry.swap_pss_kb, Optional(13));
+ EXPECT_THAT(entry.kernel_page_size_kb, Optional(14));
+ EXPECT_THAT(entry.mmu_page_size_kb, Optional(15));
+ EXPECT_THAT(entry.locked_kb, Optional(16));
+ EXPECT_THAT(entry.vm_flags,
+ Optional(ElementsAreArray({"rd", "wr", "sh", "mr", "mw", "me",
+ "ms", "lo", "??", "sd"})));
+}
+
+// Returns the unique entry in entries containing the given address.
+PosixErrorOr<ProcPidSmapsEntry> FindUniqueSmapsEntry(
+ std::vector<ProcPidSmapsEntry> const& entries, uintptr_t addr) {
+ auto const pred = [&](ProcPidSmapsEntry const& entry) {
+ return entry.maps_entry.start <= addr && addr < entry.maps_entry.end;
+ };
+ auto const it = std::find_if(entries.begin(), entries.end(), pred);
+ if (it == entries.end()) {
+ return PosixError(EINVAL,
+ absl::StrFormat("no entry contains address %#x", addr));
+ }
+ auto const it2 = std::find_if(it + 1, entries.end(), pred);
+ if (it2 != entries.end()) {
+ return PosixError(
+ EINVAL,
+ absl::StrFormat("overlapping entries [%#x-%#x) and [%#x-%#x) both "
+ "contain address %#x",
+ it->maps_entry.start, it->maps_entry.end,
+ it2->maps_entry.start, it2->maps_entry.end, addr));
+ }
+ return *it;
+}
+
+PosixErrorOr<std::vector<ProcPidSmapsEntry>> ReadProcSelfSmaps() {
+ ASSIGN_OR_RETURN_ERRNO(std::string contents, GetContents("/proc/self/smaps"));
+ return ParseProcPidSmaps(contents);
+}
+
+TEST(ProcPidSmapsTest, SharedAnon) {
+ // Map with MAP_POPULATE so we get some RSS.
+ Mapping const m = ASSERT_NO_ERRNO_AND_VALUE(MmapAnon(
+ 2 * kPageSize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE));
+ auto const entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfSmaps());
+ auto const entry =
+ ASSERT_NO_ERRNO_AND_VALUE(FindUniqueSmapsEntry(entries, m.addr()));
+
+ EXPECT_EQ(entry.size_kb, m.len() / 1024);
+ // It's possible that populated pages have been swapped out, so RSS might be
+ // less than size.
+ EXPECT_LE(entry.rss_kb, entry.size_kb);
+
+ if (entry.pss_kb) {
+ // PSS should be exactly equal to RSS since no other address spaces should
+ // be sharing our new mapping.
+ EXPECT_EQ(entry.pss_kb.value(), entry.rss_kb);
+ }
+
+ // "Shared" and "private" in smaps refers to whether or not *physical pages*
+ // are shared; thus all pages in our MAP_SHARED mapping should nevertheless
+ // be private.
+ EXPECT_EQ(entry.shared_clean_kb, 0);
+ EXPECT_EQ(entry.shared_dirty_kb, 0);
+ EXPECT_EQ(entry.private_clean_kb + entry.private_dirty_kb, entry.rss_kb)
+ << "Private_Clean = " << entry.private_clean_kb
+ << " kB, Private_Dirty = " << entry.private_dirty_kb << " kB";
+
+ // Shared anonymous mappings are implemented as a shmem file, so their pages
+ // are not PageAnon.
+ if (entry.anonymous_kb) {
+ EXPECT_EQ(entry.anonymous_kb.value(), 0);
+ }
+
+ if (entry.vm_flags) {
+ EXPECT_THAT(entry.vm_flags.value(),
+ IsSupersetOf({"rd", "wr", "sh", "mr", "mw", "me", "ms"}));
+ EXPECT_THAT(entry.vm_flags.value(), Not(Contains("ex")));
+ }
+}
+
+TEST(ProcPidSmapsTest, PrivateAnon) {
+ // Map with MAP_POPULATE so we get some RSS.
+ Mapping const m = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(2 * kPageSize, PROT_WRITE, MAP_PRIVATE | MAP_POPULATE));
+ auto const entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfSmaps());
+ auto const entry =
+ ASSERT_NO_ERRNO_AND_VALUE(FindUniqueSmapsEntry(entries, m.addr()));
+
+ // It's possible that our mapping was merged with another vma, so the smaps
+ // entry might be bigger than our original mapping.
+ EXPECT_GE(entry.size_kb, m.len() / 1024);
+ EXPECT_LE(entry.rss_kb, entry.size_kb);
+ if (entry.pss_kb) {
+ EXPECT_LE(entry.pss_kb.value(), entry.rss_kb);
+ }
+
+ if (entry.anonymous_kb) {
+ EXPECT_EQ(entry.anonymous_kb.value(), entry.rss_kb);
+ }
+
+ if (entry.vm_flags) {
+ EXPECT_THAT(entry.vm_flags.value(), IsSupersetOf({"wr", "mr", "mw", "me"}));
+ // We passed PROT_WRITE to mmap. On at least x86, the mapping is in
+ // practice readable because there is no way to configure the MMU to make
+ // pages writable but not readable. However, VmFlags should reflect the
+ // flags set on the VMA, so "rd" (VM_READ) should not appear in VmFlags.
+ EXPECT_THAT(entry.vm_flags.value(), Not(Contains("rd")));
+ EXPECT_THAT(entry.vm_flags.value(), Not(Contains("ex")));
+ EXPECT_THAT(entry.vm_flags.value(), Not(Contains("sh")));
+ EXPECT_THAT(entry.vm_flags.value(), Not(Contains("ms")));
+ }
+}
+
+TEST(ProcPidSmapsTest, SharedReadOnlyFile) {
+ size_t const kFileSize = kPageSize;
+
+ auto const temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ ASSERT_THAT(truncate(temp_file.path().c_str(), kFileSize), SyscallSucceeds());
+ auto const fd = ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file.path(), O_RDONLY));
+
+ auto const m = ASSERT_NO_ERRNO_AND_VALUE(Mmap(
+ nullptr, kFileSize, PROT_READ, MAP_SHARED | MAP_POPULATE, fd.get(), 0));
+ auto const entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfSmaps());
+ auto const entry =
+ ASSERT_NO_ERRNO_AND_VALUE(FindUniqueSmapsEntry(entries, m.addr()));
+
+ // Most of the same logic as the SharedAnon case applies.
+ EXPECT_EQ(entry.size_kb, kFileSize / 1024);
+ EXPECT_LE(entry.rss_kb, entry.size_kb);
+ if (entry.pss_kb) {
+ EXPECT_EQ(entry.pss_kb.value(), entry.rss_kb);
+ }
+ EXPECT_EQ(entry.shared_clean_kb, 0);
+ EXPECT_EQ(entry.shared_dirty_kb, 0);
+ EXPECT_EQ(entry.private_clean_kb + entry.private_dirty_kb, entry.rss_kb)
+ << "Private_Clean = " << entry.private_clean_kb
+ << " kB, Private_Dirty = " << entry.private_dirty_kb << " kB";
+ if (entry.anonymous_kb) {
+ EXPECT_EQ(entry.anonymous_kb.value(), 0);
+ }
+
+ if (entry.vm_flags) {
+ EXPECT_THAT(entry.vm_flags.value(), IsSupersetOf({"rd", "mr", "me", "ms"}));
+ EXPECT_THAT(entry.vm_flags.value(), Not(Contains("wr")));
+ EXPECT_THAT(entry.vm_flags.value(), Not(Contains("ex")));
+ // Because the mapped file was opened O_RDONLY, the VMA is !VM_MAYWRITE and
+ // also !VM_SHARED.
+ EXPECT_THAT(entry.vm_flags.value(), Not(Contains("sh")));
+ EXPECT_THAT(entry.vm_flags.value(), Not(Contains("mw")));
+ }
+}
+
+// Tests that gVisor's /proc/[pid]/smaps provides all of the fields we expect it
+// to, which as of this writing is all fields provided by Linux 4.4.
+TEST(ProcPidSmapsTest, GvisorFields) {
+ SKIP_IF(!IsRunningOnGvisor());
+ auto const entries = ASSERT_NO_ERRNO_AND_VALUE(ReadProcSelfSmaps());
+ for (auto const& entry : entries) {
+ EXPECT_TRUE(entry.pss_kb);
+ EXPECT_TRUE(entry.referenced_kb);
+ EXPECT_TRUE(entry.anonymous_kb);
+ EXPECT_TRUE(entry.anon_huge_pages_kb);
+ EXPECT_TRUE(entry.shared_hugetlb_kb);
+ EXPECT_TRUE(entry.private_hugetlb_kb);
+ EXPECT_TRUE(entry.swap_kb);
+ EXPECT_TRUE(entry.swap_pss_kb);
+ EXPECT_THAT(entry.kernel_page_size_kb, Optional(kPageSize / 1024));
+ EXPECT_THAT(entry.mmu_page_size_kb, Optional(kPageSize / 1024));
+ EXPECT_TRUE(entry.locked_kb);
+ EXPECT_TRUE(entry.vm_flags);
+ }
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/proc_pid_uid_gid_map.cc b/test/syscalls/linux/proc_pid_uid_gid_map.cc
new file mode 100644
index 000000000..748f7be58
--- /dev/null
+++ b/test/syscalls/linux/proc_pid_uid_gid_map.cc
@@ -0,0 +1,311 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <functional>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/save_util.h"
+#include "test/util/test_util.h"
+#include "test/util/time_util.h"
+
+namespace gvisor {
+namespace testing {
+
+PosixErrorOr<int> InNewUserNamespace(const std::function<void()>& fn) {
+ return InForkedProcess([&] {
+ TEST_PCHECK(unshare(CLONE_NEWUSER) == 0);
+ MaybeSave();
+ fn();
+ });
+}
+
+PosixErrorOr<std::tuple<pid_t, Cleanup>> CreateProcessInNewUserNamespace() {
+ int pipefd[2];
+ if (pipe(pipefd) < 0) {
+ return PosixError(errno, "pipe failed");
+ }
+ const auto cleanup_pipe_read =
+ Cleanup([&] { EXPECT_THAT(close(pipefd[0]), SyscallSucceeds()); });
+ auto cleanup_pipe_write =
+ Cleanup([&] { EXPECT_THAT(close(pipefd[1]), SyscallSucceeds()); });
+ pid_t child_pid = fork();
+ if (child_pid < 0) {
+ return PosixError(errno, "fork failed");
+ }
+ if (child_pid == 0) {
+ // Close our copy of the pipe's read end, which doesn't really matter.
+ TEST_PCHECK(close(pipefd[0]) >= 0);
+ TEST_PCHECK(unshare(CLONE_NEWUSER) == 0);
+ MaybeSave();
+ // Indicate that we've switched namespaces by unblocking the parent's read.
+ TEST_PCHECK(close(pipefd[1]) >= 0);
+ while (true) {
+ SleepSafe(absl::Minutes(1));
+ }
+ }
+ auto cleanup_child = Cleanup([child_pid] {
+ EXPECT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds());
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL)
+ << "status = " << status;
+ });
+ // Close our copy of the pipe's write end, then wait for the child to close
+ // its copy, indicating that it's switched namespaces.
+ cleanup_pipe_write.Release()();
+ char buf;
+ if (RetryEINTR(read)(pipefd[0], &buf, 1) < 0) {
+ return PosixError(errno, "reading from pipe failed");
+ }
+ MaybeSave();
+ return std::make_tuple(child_pid, std::move(cleanup_child));
+}
+
+// TEST_CHECK-fails on error, since this function is used in contexts that
+// require async-signal-safety.
+void DenySetgroupsByPath(const char* path) {
+ int fd = open(path, O_WRONLY);
+ if (fd < 0 && errno == ENOENT) {
+ // On kernels where this file doesn't exist, writing "deny" to it isn't
+ // necessary to write to gid_map.
+ return;
+ }
+ TEST_PCHECK(fd >= 0);
+ MaybeSave();
+ char deny[] = "deny";
+ TEST_PCHECK(write(fd, deny, sizeof(deny)) == sizeof(deny));
+ MaybeSave();
+ TEST_PCHECK(close(fd) == 0);
+}
+
+void DenySelfSetgroups() { DenySetgroupsByPath("/proc/self/setgroups"); }
+
+void DenyPidSetgroups(pid_t pid) {
+ DenySetgroupsByPath(absl::StrCat("/proc/", pid, "/setgroups").c_str());
+}
+
+// Returns a valid UID/GID that isn't id.
+uint32_t another_id(uint32_t id) { return (id + 1) % 65535; }
+
+struct TestParam {
+ std::string desc;
+ int cap;
+ std::function<std::string(absl::string_view)> get_map_filename;
+ std::function<uint32_t()> get_current_id;
+};
+
+std::string DescribeTestParam(const ::testing::TestParamInfo<TestParam>& info) {
+ return info.param.desc;
+}
+
+std::vector<TestParam> UidGidMapTestParams() {
+ return {TestParam{"UID", CAP_SETUID,
+ [](absl::string_view pid) {
+ return absl::StrCat("/proc/", pid, "/uid_map");
+ },
+ []() -> uint32_t { return getuid(); }},
+ TestParam{"GID", CAP_SETGID,
+ [](absl::string_view pid) {
+ return absl::StrCat("/proc/", pid, "/gid_map");
+ },
+ []() -> uint32_t { return getgid(); }}};
+}
+
+class ProcUidGidMapTest : public ::testing::TestWithParam<TestParam> {
+ protected:
+ uint32_t CurrentID() { return GetParam().get_current_id(); }
+};
+
+class ProcSelfUidGidMapTest : public ProcUidGidMapTest {
+ protected:
+ PosixErrorOr<int> InNewUserNamespaceWithMapFD(
+ const std::function<void(int)>& fn) {
+ std::string map_filename = GetParam().get_map_filename("self");
+ return InNewUserNamespace([&] {
+ int fd = open(map_filename.c_str(), O_RDWR);
+ TEST_PCHECK(fd >= 0);
+ MaybeSave();
+ fn(fd);
+ TEST_PCHECK(close(fd) == 0);
+ });
+ }
+};
+
+class ProcPidUidGidMapTest : public ProcUidGidMapTest {
+ protected:
+ PosixErrorOr<bool> HaveSetIDCapability() {
+ return HaveCapability(GetParam().cap);
+ }
+
+ // Returns true if the caller is running in a user namespace with all IDs
+ // mapped. This matters for tests that expect to successfully map arbitrary
+ // IDs into a child user namespace, since even with CAP_SET*ID this is only
+ // possible if those IDs are mapped into the current one.
+ PosixErrorOr<bool> AllIDsMapped() {
+ ASSIGN_OR_RETURN_ERRNO(std::string id_map,
+ GetContents(GetParam().get_map_filename("self")));
+ absl::StripTrailingAsciiWhitespace(&id_map);
+ std::vector<std::string> id_map_parts =
+ absl::StrSplit(id_map, ' ', absl::SkipEmpty());
+ return id_map_parts == std::vector<std::string>({"0", "0", "4294967295"});
+ }
+
+ PosixErrorOr<FileDescriptor> OpenMapFile(pid_t pid) {
+ return Open(GetParam().get_map_filename(absl::StrCat(pid)), O_RDWR);
+ }
+};
+
+TEST_P(ProcSelfUidGidMapTest, IsInitiallyEmpty) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
+ EXPECT_THAT(InNewUserNamespaceWithMapFD([](int fd) {
+ char buf[64];
+ TEST_PCHECK(read(fd, buf, sizeof(buf)) == 0);
+ }),
+ IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(ProcSelfUidGidMapTest, IdentityMapOwnID) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
+ uint32_t id = CurrentID();
+ std::string line = absl::StrCat(id, " ", id, " 1");
+ EXPECT_THAT(
+ InNewUserNamespaceWithMapFD([&](int fd) {
+ DenySelfSetgroups();
+ TEST_PCHECK(write(fd, line.c_str(), line.size()) == line.size());
+ }),
+ IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(ProcSelfUidGidMapTest, TrailingNewlineAndNULIgnored) {
+ // This is identical to IdentityMapOwnID, except that a trailing newline, NUL,
+ // and an invalid (incomplete) map entry are appended to the valid entry. The
+ // newline should be accepted, and everything after the NUL should be ignored.
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
+ uint32_t id = CurrentID();
+ std::string line = absl::StrCat(id, " ", id, " 1\n\0 4 3");
+ EXPECT_THAT(
+ InNewUserNamespaceWithMapFD([&](int fd) {
+ DenySelfSetgroups();
+ // The write should return the full size of the write, even though
+ // characters after the NUL were ignored.
+ TEST_PCHECK(write(fd, line.c_str(), line.size()) == line.size());
+ }),
+ IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(ProcSelfUidGidMapTest, NonIdentityMapOwnID) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
+ uint32_t id = CurrentID();
+ uint32_t id2 = another_id(id);
+ std::string line = absl::StrCat(id2, " ", id, " 1");
+ EXPECT_THAT(
+ InNewUserNamespaceWithMapFD([&](int fd) {
+ DenySelfSetgroups();
+ TEST_PCHECK(write(fd, line.c_str(), line.size()) == line.size());
+ }),
+ IsPosixErrorOkAndHolds(0));
+}
+
+TEST_P(ProcSelfUidGidMapTest, MapOtherID) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
+ // Whether or not we have CAP_SET*ID is irrelevant: the process running in the
+ // new (child) user namespace won't have any capabilities in the current
+ // (parent) user namespace, which is needed.
+ uint32_t id = CurrentID();
+ uint32_t id2 = another_id(id);
+ std::string line = absl::StrCat(id, " ", id2, " 1");
+ EXPECT_THAT(InNewUserNamespaceWithMapFD([&](int fd) {
+ DenySelfSetgroups();
+ TEST_PCHECK(write(fd, line.c_str(), line.size()) < 0);
+ TEST_CHECK(errno == EPERM);
+ }),
+ IsPosixErrorOkAndHolds(0));
+}
+
+INSTANTIATE_TEST_SUITE_P(All, ProcSelfUidGidMapTest,
+ ::testing::ValuesIn(UidGidMapTestParams()),
+ DescribeTestParam);
+
+TEST_P(ProcPidUidGidMapTest, MapOtherIDPrivileged) {
+ // Like ProcSelfUidGidMapTest_MapOtherID, but since we have CAP_SET*ID in the
+ // parent user namespace (this one), we can map IDs that aren't ours.
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveSetIDCapability()));
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(AllIDsMapped()));
+
+ pid_t child_pid;
+ Cleanup cleanup_child;
+ std::tie(child_pid, cleanup_child) =
+ ASSERT_NO_ERRNO_AND_VALUE(CreateProcessInNewUserNamespace());
+
+ uint32_t id = CurrentID();
+ uint32_t id2 = another_id(id);
+ std::string line = absl::StrCat(id, " ", id2, " 1");
+ DenyPidSetgroups(child_pid);
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenMapFile(child_pid));
+ EXPECT_THAT(write(fd.get(), line.c_str(), line.size()),
+ SyscallSucceedsWithValue(line.size()));
+}
+
+TEST_P(ProcPidUidGidMapTest, MapAnyIDsPrivileged) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(CanCreateUserNamespace()));
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveSetIDCapability()));
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(AllIDsMapped()));
+
+ pid_t child_pid;
+ Cleanup cleanup_child;
+ std::tie(child_pid, cleanup_child) =
+ ASSERT_NO_ERRNO_AND_VALUE(CreateProcessInNewUserNamespace());
+
+ // Test all of:
+ //
+ // - Mapping ranges of length > 1
+ //
+ // - Mapping multiple ranges
+ //
+ // - Non-identity mappings
+ char entries[] = "2 0 2\n4 6 2";
+ DenyPidSetgroups(child_pid);
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(OpenMapFile(child_pid));
+ EXPECT_THAT(write(fd.get(), entries, sizeof(entries)),
+ SyscallSucceedsWithValue(sizeof(entries)));
+}
+
+INSTANTIATE_TEST_SUITE_P(All, ProcPidUidGidMapTest,
+ ::testing::ValuesIn(UidGidMapTestParams()),
+ DescribeTestParam);
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/pselect.cc b/test/syscalls/linux/pselect.cc
new file mode 100644
index 000000000..4e43c4d7f
--- /dev/null
+++ b/test/syscalls/linux/pselect.cc
@@ -0,0 +1,190 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <sys/select.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/base_poll_test.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+struct MaskWithSize {
+ sigset_t* mask;
+ size_t mask_size;
+};
+
+// Linux and glibc have a different idea of the sizeof sigset_t. When calling
+// the syscall directly, use what the kernel expects.
+unsigned kSigsetSize = SIGRTMAX / 8;
+
+// Linux pselect(2) differs from the glibc wrapper function in that Linux
+// updates the timeout with the amount of time remaining. In order to test this
+// behavior we need to use the syscall directly.
+int syscallPselect6(int nfds, fd_set* readfds, fd_set* writefds,
+ fd_set* exceptfds, struct timespec* timeout,
+ const MaskWithSize* mask_with_size) {
+ return syscall(SYS_pselect6, nfds, readfds, writefds, exceptfds, timeout,
+ mask_with_size);
+}
+
+class PselectTest : public BasePollTest {
+ protected:
+ void SetUp() override { BasePollTest::SetUp(); }
+ void TearDown() override { BasePollTest::TearDown(); }
+};
+
+// See that when there are no FD sets, pselect behaves like sleep.
+TEST_F(PselectTest, NullFds) {
+ struct timespec timeout = absl::ToTimespec(absl::Milliseconds(10));
+ ASSERT_THAT(syscallPselect6(0, nullptr, nullptr, nullptr, &timeout, nullptr),
+ SyscallSucceeds());
+ EXPECT_EQ(timeout.tv_sec, 0);
+ EXPECT_EQ(timeout.tv_nsec, 0);
+
+ timeout = absl::ToTimespec(absl::Milliseconds(10));
+ ASSERT_THAT(syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, nullptr),
+ SyscallSucceeds());
+ EXPECT_EQ(timeout.tv_sec, 0);
+ EXPECT_EQ(timeout.tv_nsec, 0);
+}
+
+TEST_F(PselectTest, ClosedFds) {
+ fd_set read_set;
+ FD_ZERO(&read_set);
+ int fd;
+ ASSERT_THAT(fd = dup(1), SyscallSucceeds());
+ ASSERT_THAT(close(fd), SyscallSucceeds());
+ FD_SET(fd, &read_set);
+ struct timespec timeout = absl::ToTimespec(absl::Milliseconds(10));
+ EXPECT_THAT(
+ syscallPselect6(fd + 1, &read_set, nullptr, nullptr, &timeout, nullptr),
+ SyscallFailsWithErrno(EBADF));
+}
+
+TEST_F(PselectTest, ZeroTimeout) {
+ struct timespec timeout = {};
+ ASSERT_THAT(syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, nullptr),
+ SyscallSucceeds());
+ EXPECT_EQ(timeout.tv_sec, 0);
+ EXPECT_EQ(timeout.tv_nsec, 0);
+}
+
+// If random S/R interrupts the pselect, SIGALRM may be delivered before pselect
+// restarts, causing the pselect to hang forever.
+TEST_F(PselectTest, NoTimeout_NoRandomSave) {
+ // When there's no timeout, pselect may never return so set a timer.
+ SetTimer(absl::Milliseconds(100));
+ // See that we get interrupted by the timer.
+ ASSERT_THAT(syscallPselect6(1, nullptr, nullptr, nullptr, nullptr, nullptr),
+ SyscallFailsWithErrno(EINTR));
+ EXPECT_TRUE(TimerFired());
+}
+
+TEST_F(PselectTest, InvalidTimeoutNegative) {
+ struct timespec timeout = absl::ToTimespec(absl::Seconds(-1));
+ ASSERT_THAT(syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_EQ(timeout.tv_sec, -1);
+ EXPECT_EQ(timeout.tv_nsec, 0);
+}
+
+TEST_F(PselectTest, InvalidTimeoutNotNormalized) {
+ struct timespec timeout = {0, 1000000001};
+ ASSERT_THAT(syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_EQ(timeout.tv_sec, 0);
+ EXPECT_EQ(timeout.tv_nsec, 1000000001);
+}
+
+TEST_F(PselectTest, EmptySigMaskInvalidMaskSize) {
+ struct timespec timeout = {};
+ MaskWithSize invalid = {nullptr, 7};
+ EXPECT_THAT(syscallPselect6(0, nullptr, nullptr, nullptr, &timeout, &invalid),
+ SyscallSucceeds());
+}
+
+TEST_F(PselectTest, EmptySigMaskValidMaskSize) {
+ struct timespec timeout = {};
+ MaskWithSize invalid = {nullptr, 8};
+ EXPECT_THAT(syscallPselect6(0, nullptr, nullptr, nullptr, &timeout, &invalid),
+ SyscallSucceeds());
+}
+
+TEST_F(PselectTest, InvalidMaskSize) {
+ struct timespec timeout = {};
+ sigset_t sigmask;
+ ASSERT_THAT(sigemptyset(&sigmask), SyscallSucceeds());
+ MaskWithSize invalid = {&sigmask, 7};
+ EXPECT_THAT(syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, &invalid),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Verify that signals blocked by the pselect mask (that would otherwise be
+// allowed) do not interrupt pselect.
+TEST_F(PselectTest, SignalMaskBlocksSignal) {
+ absl::Duration duration(absl::Seconds(30));
+ struct timespec timeout = absl::ToTimespec(duration);
+ absl::Duration timer_duration(absl::Seconds(10));
+
+ // Call with a mask that blocks SIGALRM. See that pselect is not interrupted
+ // (i.e. returns 0) and that upon completion, the timer has fired.
+ sigset_t mask;
+ ASSERT_THAT(sigprocmask(0, nullptr, &mask), SyscallSucceeds());
+ ASSERT_THAT(sigaddset(&mask, SIGALRM), SyscallSucceeds());
+ MaskWithSize mask_with_size = {&mask, kSigsetSize};
+ SetTimer(timer_duration);
+ MaybeSave();
+ ASSERT_FALSE(TimerFired());
+ ASSERT_THAT(
+ syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, &mask_with_size),
+ SyscallSucceeds());
+ EXPECT_TRUE(TimerFired());
+ EXPECT_EQ(absl::DurationFromTimespec(timeout), absl::Duration());
+}
+
+// Verify that signals allowed by the pselect mask (that would otherwise be
+// blocked) interrupt pselect.
+TEST_F(PselectTest, SignalMaskAllowsSignal) {
+ absl::Duration duration = absl::Seconds(30);
+ struct timespec timeout = absl::ToTimespec(duration);
+ absl::Duration timer_duration = absl::Seconds(10);
+
+ sigset_t mask;
+ ASSERT_THAT(sigprocmask(0, nullptr, &mask), SyscallSucceeds());
+
+ // Block SIGALRM.
+ auto cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, SIGALRM));
+
+ // Call with a mask that unblocks SIGALRM. See that pselect is interrupted.
+ MaskWithSize mask_with_size = {&mask, kSigsetSize};
+ SetTimer(timer_duration);
+ MaybeSave();
+ ASSERT_FALSE(TimerFired());
+ ASSERT_THAT(
+ syscallPselect6(1, nullptr, nullptr, nullptr, &timeout, &mask_with_size),
+ SyscallFailsWithErrno(EINTR));
+ EXPECT_TRUE(TimerFired());
+ EXPECT_GT(absl::DurationFromTimespec(timeout), absl::Duration());
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/ptrace.cc b/test/syscalls/linux/ptrace.cc
new file mode 100644
index 000000000..926690eb8
--- /dev/null
+++ b/test/syscalls/linux/ptrace.cc
@@ -0,0 +1,1229 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <elf.h>
+#include <signal.h>
+#include <stddef.h>
+#include <sys/ptrace.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/flags/flag.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/platform_util.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/time_util.h"
+
+ABSL_FLAG(bool, ptrace_test_execve_child, false,
+ "If true, run the "
+ "PtraceExecveTest_Execve_GetRegs_PeekUser_SIGKILL_TraceClone_"
+ "TraceExit child workload.");
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// PTRACE_GETSIGMASK and PTRACE_SETSIGMASK are not defined until glibc 2.23
+// (fb53a27c5741 "Add new header definitions from Linux 4.4 (plus older ptrace
+// definitions)").
+constexpr auto kPtraceGetSigMask = static_cast<__ptrace_request>(0x420a);
+constexpr auto kPtraceSetSigMask = static_cast<__ptrace_request>(0x420b);
+
+// PTRACE_SYSEMU is not defined until glibc 2.27 (c48831d0eebf "linux/x86: sync
+// sys/ptrace.h with Linux 4.14 [BZ #22433]").
+constexpr auto kPtraceSysemu = static_cast<__ptrace_request>(31);
+
+// PTRACE_EVENT_STOP is not defined until glibc 2.26 (3f67d1a7021e "Add Linux
+// PTRACE_EVENT_STOP").
+constexpr int kPtraceEventStop = 128;
+
+// Sends sig to the current process with tgkill(2).
+//
+// glibc's raise(2) may change the signal mask before sending the signal. These
+// extra syscalls make tests of syscall, signal interception, etc. difficult to
+// write.
+void RaiseSignal(int sig) {
+ pid_t pid = getpid();
+ TEST_PCHECK(pid > 0);
+ pid_t tid = gettid();
+ TEST_PCHECK(tid > 0);
+ TEST_PCHECK(tgkill(pid, tid, sig) == 0);
+}
+
+// Returns the Yama ptrace scope.
+PosixErrorOr<int> YamaPtraceScope() {
+ constexpr char kYamaPtraceScopePath[] = "/proc/sys/kernel/yama/ptrace_scope";
+
+ ASSIGN_OR_RETURN_ERRNO(bool exists, Exists(kYamaPtraceScopePath));
+ if (!exists) {
+ // File doesn't exist means no Yama, so the scope is disabled -> 0.
+ return 0;
+ }
+
+ std::string contents;
+ RETURN_IF_ERRNO(GetContents(kYamaPtraceScopePath, &contents));
+
+ int scope;
+ if (!absl::SimpleAtoi(contents, &scope)) {
+ return PosixError(EINVAL, absl::StrCat(contents, ": not a valid number"));
+ }
+
+ return scope;
+}
+
+TEST(PtraceTest, AttachSelf) {
+ EXPECT_THAT(ptrace(PTRACE_ATTACH, gettid(), 0, 0),
+ SyscallFailsWithErrno(EPERM));
+}
+
+TEST(PtraceTest, AttachSameThreadGroup) {
+ pid_t const tid = gettid();
+ ScopedThread([&] {
+ EXPECT_THAT(ptrace(PTRACE_ATTACH, tid, 0, 0), SyscallFailsWithErrno(EPERM));
+ });
+}
+
+TEST(PtraceTest, AttachParent_PeekData_PokeData_SignalSuppression) {
+ // Yama prevents attaching to a parent. Skip the test if the scope is anything
+ // except disabled.
+ SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(YamaPtraceScope()) > 0);
+
+ constexpr long kBeforePokeDataValue = 10;
+ constexpr long kAfterPokeDataValue = 20;
+
+ volatile long word = kBeforePokeDataValue;
+
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+
+ // Attach to the parent.
+ pid_t const parent_pid = getppid();
+ TEST_PCHECK(ptrace(PTRACE_ATTACH, parent_pid, 0, 0) == 0);
+ MaybeSave();
+
+ // Block until the parent enters signal-delivery-stop as a result of the
+ // SIGSTOP sent by PTRACE_ATTACH.
+ int status;
+ TEST_PCHECK(waitpid(parent_pid, &status, 0) == parent_pid);
+ MaybeSave();
+ TEST_CHECK(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+ // Replace the value of word in the parent process with kAfterPokeDataValue.
+ long const parent_word = ptrace(PTRACE_PEEKDATA, parent_pid, &word, 0);
+ MaybeSave();
+ TEST_CHECK(parent_word == kBeforePokeDataValue);
+ TEST_PCHECK(
+ ptrace(PTRACE_POKEDATA, parent_pid, &word, kAfterPokeDataValue) == 0);
+ MaybeSave();
+
+ // Detach from the parent and suppress the SIGSTOP. If the SIGSTOP is not
+ // suppressed, the parent will hang in group-stop, causing the test to time
+ // out.
+ TEST_PCHECK(ptrace(PTRACE_DETACH, parent_pid, 0, 0) == 0);
+ MaybeSave();
+ _exit(0);
+ }
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Wait for the child to complete.
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << " status " << status;
+
+ // Check that the child's PTRACE_POKEDATA was effective.
+ EXPECT_EQ(kAfterPokeDataValue, word);
+}
+
+TEST(PtraceTest, GetSigMask) {
+ // glibc and the Linux kernel define a sigset_t with different sizes. To avoid
+ // creating a kernel_sigset_t and recreating all the modification functions
+ // (sigemptyset, etc), we just hardcode the kernel sigset size.
+ constexpr int kSizeofKernelSigset = 8;
+ constexpr int kBlockSignal = SIGUSR1;
+ sigset_t blocked;
+ sigemptyset(&blocked);
+ sigaddset(&blocked, kBlockSignal);
+
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+
+ // Install a signal handler for kBlockSignal to avoid termination and block
+ // it.
+ TEST_PCHECK(signal(
+ kBlockSignal, +[](int signo) {}) != SIG_ERR);
+ MaybeSave();
+ TEST_PCHECK(sigprocmask(SIG_SETMASK, &blocked, nullptr) == 0);
+ MaybeSave();
+
+ // Enable tracing.
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+ MaybeSave();
+
+ // This should be blocked.
+ RaiseSignal(kBlockSignal);
+
+ // This should be suppressed by parent, who will change signal mask in the
+ // meantime, which means kBlockSignal should be delivered once this resumes.
+ RaiseSignal(SIGSTOP);
+
+ _exit(0);
+ }
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop.
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << " status " << status;
+
+ // Get current signal mask.
+ sigset_t set;
+ EXPECT_THAT(ptrace(kPtraceGetSigMask, child_pid, kSizeofKernelSigset, &set),
+ SyscallSucceeds());
+ EXPECT_THAT(blocked, EqualsSigset(set));
+
+ // Try to get current signal mask with bad size argument.
+ EXPECT_THAT(ptrace(kPtraceGetSigMask, child_pid, 0, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Try to set bad signal mask.
+ sigset_t* bad_addr = reinterpret_cast<sigset_t*>(-1);
+ EXPECT_THAT(
+ ptrace(kPtraceSetSigMask, child_pid, kSizeofKernelSigset, bad_addr),
+ SyscallFailsWithErrno(EFAULT));
+
+ // Set signal mask to empty set.
+ sigset_t set1;
+ sigemptyset(&set1);
+ EXPECT_THAT(ptrace(kPtraceSetSigMask, child_pid, kSizeofKernelSigset, &set1),
+ SyscallSucceeds());
+
+ // Suppress SIGSTOP and resume the child. It should re-enter
+ // signal-delivery-stop for kBlockSignal.
+ ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == kBlockSignal)
+ << " status " << status;
+
+ ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ // Let's see that process exited normally.
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << " status " << status;
+}
+
+TEST(PtraceTest, GetSiginfo_SetSiginfo_SignalInjection) {
+ constexpr int kOriginalSigno = SIGUSR1;
+ constexpr int kInjectedSigno = SIGUSR2;
+
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+
+ // Override all signal handlers.
+ struct sigaction sa = {};
+ sa.sa_handler = +[](int signo) { _exit(signo); };
+ TEST_PCHECK(sigfillset(&sa.sa_mask) == 0);
+ for (int signo = 1; signo < 32; signo++) {
+ if (signo == SIGKILL || signo == SIGSTOP) {
+ continue;
+ }
+ TEST_PCHECK(sigaction(signo, &sa, nullptr) == 0);
+ }
+ for (int signo = SIGRTMIN; signo <= SIGRTMAX; signo++) {
+ TEST_PCHECK(sigaction(signo, &sa, nullptr) == 0);
+ }
+
+ // Unblock all signals.
+ TEST_PCHECK(sigprocmask(SIG_UNBLOCK, &sa.sa_mask, nullptr) == 0);
+ MaybeSave();
+
+ // Send ourselves kOriginalSignal while ptraced and exit with the signal we
+ // actually receive via the signal handler, if any, or 0 if we don't receive
+ // a signal.
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+ MaybeSave();
+ RaiseSignal(kOriginalSigno);
+ _exit(0);
+ }
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Wait for the child to send itself kOriginalSigno and enter
+ // signal-delivery-stop.
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == kOriginalSigno)
+ << " status " << status;
+
+ siginfo_t siginfo = {};
+ ASSERT_THAT(ptrace(PTRACE_GETSIGINFO, child_pid, 0, &siginfo),
+ SyscallSucceeds());
+ EXPECT_EQ(kOriginalSigno, siginfo.si_signo);
+ EXPECT_EQ(SI_TKILL, siginfo.si_code);
+
+ // Replace the signal with kInjectedSigno, and check that the child exits
+ // with kInjectedSigno, indicating that signal injection was successful.
+ siginfo.si_signo = kInjectedSigno;
+ ASSERT_THAT(ptrace(PTRACE_SETSIGINFO, child_pid, 0, &siginfo),
+ SyscallSucceeds());
+ ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, kInjectedSigno),
+ SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == kInjectedSigno)
+ << " status " << status;
+}
+
+TEST(PtraceTest, SIGKILLDoesNotCauseSignalDeliveryStop) {
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+ MaybeSave();
+ RaiseSignal(SIGKILL);
+ TEST_CHECK_MSG(false, "Survived SIGKILL?");
+ _exit(1);
+ }
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Expect the child to die to SIGKILL without entering signal-delivery-stop.
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL)
+ << " status " << status;
+}
+
+TEST(PtraceTest, PtraceKill) {
+ constexpr int kOriginalSigno = SIGUSR1;
+
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+ MaybeSave();
+
+ // PTRACE_KILL only works if tracee has entered signal-delivery-stop.
+ RaiseSignal(kOriginalSigno);
+ TEST_CHECK_MSG(false, "Failed to kill the process?");
+ _exit(0);
+ }
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Wait for the child to send itself kOriginalSigno and enter
+ // signal-delivery-stop.
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == kOriginalSigno)
+ << " status " << status;
+
+ ASSERT_THAT(ptrace(PTRACE_KILL, child_pid, 0, 0), SyscallSucceeds());
+
+ // Expect the child to die with SIGKILL.
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL)
+ << " status " << status;
+}
+
+TEST(PtraceTest, GetRegSet) {
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+
+ // Enable tracing.
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+ MaybeSave();
+
+ // Use kill explicitly because we check the syscall argument register below.
+ kill(getpid(), SIGSTOP);
+
+ _exit(0);
+ }
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop.
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << " status " << status;
+
+ // Get the general registers.
+ struct user_regs_struct regs;
+ struct iovec iov;
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+ EXPECT_THAT(ptrace(PTRACE_GETREGSET, child_pid, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+
+ // Read exactly the full register set.
+ EXPECT_EQ(iov.iov_len, sizeof(regs));
+
+#if defined(__x86_64__)
+ // Child called kill(2), with SIGSTOP as arg 2.
+ EXPECT_EQ(regs.rsi, SIGSTOP);
+#elif defined(__aarch64__)
+ EXPECT_EQ(regs.regs[1], SIGSTOP);
+#endif
+
+ // Suppress SIGSTOP and resume the child.
+ ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ // Let's see that process exited normally.
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << " status " << status;
+}
+
+TEST(PtraceTest, AttachingConvertsGroupStopToPtraceStop) {
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+ while (true) {
+ pause();
+ }
+ }
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // SIGSTOP the child and wait for it to stop.
+ ASSERT_THAT(kill(child_pid, SIGSTOP), SyscallSucceeds());
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, WUNTRACED),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << " status " << status;
+
+ // Attach to the child and expect it to re-enter a traced group-stop despite
+ // already being stopped.
+ ASSERT_THAT(ptrace(PTRACE_ATTACH, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << " status " << status;
+
+ // Verify that the child is ptrace-stopped by checking that it can receive
+ // ptrace commands requiring a ptrace-stop.
+ EXPECT_THAT(ptrace(PTRACE_SETOPTIONS, child_pid, 0, 0), SyscallSucceeds());
+
+ // Group-stop is distinguished from signal-delivery-stop by PTRACE_GETSIGINFO
+ // failing with EINVAL.
+ siginfo_t siginfo = {};
+ EXPECT_THAT(ptrace(PTRACE_GETSIGINFO, child_pid, 0, &siginfo),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Detach from the child and expect it to stay stopped without a notification.
+ ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, WUNTRACED | WNOHANG),
+ SyscallSucceedsWithValue(0));
+
+ // Sending it SIGCONT should cause it to leave its stop.
+ ASSERT_THAT(kill(child_pid, SIGCONT), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, WCONTINUED),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFCONTINUED(status)) << " status " << status;
+
+ // Clean up the child.
+ ASSERT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL)
+ << " status " << status;
+}
+
+// Fixture for tests parameterized by whether or not to use PTRACE_O_TRACEEXEC.
+class PtraceExecveTest : public ::testing::TestWithParam<bool> {
+ protected:
+ bool TraceExec() const { return GetParam(); }
+};
+
+TEST_P(PtraceExecveTest, Execve_GetRegs_PeekUser_SIGKILL_TraceClone_TraceExit) {
+ ExecveArray const owned_child_argv = {"/proc/self/exe",
+ "--ptrace_test_execve_child"};
+ char* const* const child_argv = owned_child_argv.get();
+
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process. The test relies on calling execve() in a non-leader
+ // thread; pthread_create() isn't async-signal-safe, so the safest way to
+ // do this is to execve() first, then enable tracing and run the expected
+ // child process behavior in the new subprocess.
+ execve(child_argv[0], child_argv, /* envp = */ nullptr);
+ TEST_PCHECK_MSG(false, "Survived execve to test child");
+ }
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop.
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << " status " << status;
+
+ // Enable PTRACE_O_TRACECLONE so we can get the ID of the child's non-leader
+ // thread, PTRACE_O_TRACEEXIT so we can observe the leader's death, and
+ // PTRACE_O_TRACEEXEC if required by the test. (The leader doesn't call
+ // execve, but options should be inherited across clone.)
+ long opts = PTRACE_O_TRACECLONE | PTRACE_O_TRACEEXIT;
+ if (TraceExec()) {
+ opts |= PTRACE_O_TRACEEXEC;
+ }
+ ASSERT_THAT(ptrace(PTRACE_SETOPTIONS, child_pid, 0, opts), SyscallSucceeds());
+
+ // Suppress the SIGSTOP and wait for the child's leader thread to report
+ // PTRACE_EVENT_CLONE. Get the new thread's ID from the event.
+ ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_EQ(SIGTRAP | (PTRACE_EVENT_CLONE << 8), status >> 8);
+ unsigned long eventmsg;
+ ASSERT_THAT(ptrace(PTRACE_GETEVENTMSG, child_pid, 0, &eventmsg),
+ SyscallSucceeds());
+ pid_t const nonleader_tid = eventmsg;
+ pid_t const leader_tid = child_pid;
+
+ // The new thread should be ptraced and in signal-delivery-stop by SIGSTOP due
+ // to PTRACE_O_TRACECLONE.
+ //
+ // Before bf959931ddb88c4e4366e96dd22e68fa0db9527c "wait/ptrace: assume __WALL
+ // if the child is traced" (4.7) , waiting on it requires __WCLONE since, as a
+ // non-leader, its termination signal is 0. After, a standard wait is
+ // sufficient.
+ ASSERT_THAT(waitpid(nonleader_tid, &status, __WCLONE),
+ SyscallSucceedsWithValue(nonleader_tid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << " status " << status;
+
+ // Resume both child threads.
+ for (pid_t const tid : {leader_tid, nonleader_tid}) {
+ ASSERT_THAT(ptrace(PTRACE_CONT, tid, 0, 0), SyscallSucceeds());
+ }
+
+ // The non-leader child thread should call execve, causing the leader thread
+ // to enter PTRACE_EVENT_EXIT with an apparent exit code of 0. At this point,
+ // the leader has not yet exited, so the non-leader should be blocked in
+ // execve.
+ ASSERT_THAT(waitpid(leader_tid, &status, 0),
+ SyscallSucceedsWithValue(leader_tid));
+ EXPECT_EQ(SIGTRAP | (PTRACE_EVENT_EXIT << 8), status >> 8);
+ ASSERT_THAT(ptrace(PTRACE_GETEVENTMSG, leader_tid, 0, &eventmsg),
+ SyscallSucceeds());
+ EXPECT_TRUE(WIFEXITED(eventmsg) && WEXITSTATUS(eventmsg) == 0)
+ << " eventmsg " << eventmsg;
+ EXPECT_THAT(waitpid(nonleader_tid, &status, __WCLONE | WNOHANG),
+ SyscallSucceedsWithValue(0));
+
+ // Allow the leader to continue exiting. This should allow the non-leader to
+ // complete its execve, causing the original leader to be reaped without
+ // further notice and the non-leader to steal its ID.
+ ASSERT_THAT(ptrace(PTRACE_CONT, leader_tid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(leader_tid, &status, 0),
+ SyscallSucceedsWithValue(leader_tid));
+ if (TraceExec()) {
+ // If PTRACE_O_TRACEEXEC was enabled, the execing thread should be in
+ // PTRACE_EVENT_EXEC-stop, with the event message set to its old thread ID.
+ EXPECT_EQ(SIGTRAP | (PTRACE_EVENT_EXEC << 8), status >> 8);
+ ASSERT_THAT(ptrace(PTRACE_GETEVENTMSG, leader_tid, 0, &eventmsg),
+ SyscallSucceeds());
+ EXPECT_EQ(nonleader_tid, eventmsg);
+ } else {
+ // Otherwise, the execing thread should have received SIGTRAP and should now
+ // be in signal-delivery-stop.
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
+ << " status " << status;
+ }
+
+#ifdef __x86_64__
+ {
+ // CS should be 0x33, indicating an 64-bit binary.
+ constexpr uint64_t kAMD64UserCS = 0x33;
+ EXPECT_THAT(ptrace(PTRACE_PEEKUSER, leader_tid,
+ offsetof(struct user_regs_struct, cs), 0),
+ SyscallSucceedsWithValue(kAMD64UserCS));
+ struct user_regs_struct regs = {};
+ ASSERT_THAT(ptrace(PTRACE_GETREGS, leader_tid, 0, &regs),
+ SyscallSucceeds());
+ EXPECT_EQ(kAMD64UserCS, regs.cs);
+ }
+#endif // defined(__x86_64__)
+
+ // PTRACE_O_TRACEEXIT should have been inherited across execve. Send SIGKILL,
+ // which should end the PTRACE_EVENT_EXEC-stop or signal-delivery-stop and
+ // leave the child in PTRACE_EVENT_EXIT-stop.
+ ASSERT_THAT(kill(leader_tid, SIGKILL), SyscallSucceeds());
+ ASSERT_THAT(waitpid(leader_tid, &status, 0),
+ SyscallSucceedsWithValue(leader_tid));
+ EXPECT_EQ(SIGTRAP | (PTRACE_EVENT_EXIT << 8), status >> 8);
+ ASSERT_THAT(ptrace(PTRACE_GETEVENTMSG, leader_tid, 0, &eventmsg),
+ SyscallSucceeds());
+ EXPECT_TRUE(WIFSIGNALED(eventmsg) && WTERMSIG(eventmsg) == SIGKILL)
+ << " eventmsg " << eventmsg;
+
+ // End the PTRACE_EVENT_EXIT stop, allowing the child to exit.
+ ASSERT_THAT(ptrace(PTRACE_CONT, leader_tid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(leader_tid, &status, 0),
+ SyscallSucceedsWithValue(leader_tid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL)
+ << " status " << status;
+}
+
+[[noreturn]] void RunExecveChild() {
+ // Enable tracing, then raise SIGSTOP and expect our parent to suppress it.
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+ MaybeSave();
+ RaiseSignal(SIGSTOP);
+ MaybeSave();
+
+ // Call execve() in a non-leader thread. As long as execve() succeeds, what
+ // exactly we execve() shouldn't really matter, since the tracer should kill
+ // us after execve() completes.
+ ScopedThread t([&] {
+ ExecveArray const owned_child_argv = {"/proc/self/exe",
+ "--this_flag_shouldnt_exist"};
+ char* const* const child_argv = owned_child_argv.get();
+ execve(child_argv[0], child_argv, /* envp = */ nullptr);
+ TEST_PCHECK_MSG(false, "Survived execve? (thread)");
+ });
+ t.Join();
+ TEST_CHECK_MSG(false, "Survived execve? (main)");
+ _exit(1);
+}
+
+INSTANTIATE_TEST_SUITE_P(TraceExec, PtraceExecveTest, ::testing::Bool());
+
+// This test has expectations on when syscall-enter/exit-stops occur that are
+// violated if saving occurs, since saving interrupts all syscalls, causing
+// premature syscall-exit.
+TEST(PtraceTest,
+ ExitWhenParentIsNotTracer_Syscall_TraceVfork_TraceVforkDone_NoRandomSave) {
+ constexpr int kExitTraceeExitCode = 99;
+
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+
+ // Block SIGCHLD so it doesn't interrupt wait4.
+ sigset_t mask;
+ TEST_PCHECK(sigemptyset(&mask) == 0);
+ TEST_PCHECK(sigaddset(&mask, SIGCHLD) == 0);
+ TEST_PCHECK(sigprocmask(SIG_SETMASK, &mask, nullptr) == 0);
+ MaybeSave();
+
+ // Enable tracing, then raise SIGSTOP and expect our parent to suppress it.
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+ MaybeSave();
+ RaiseSignal(SIGSTOP);
+ MaybeSave();
+
+ // Spawn a vfork child that exits immediately, and reap it. Don't save
+ // after vfork since the parent expects to see wait4 as the next syscall.
+ pid_t const pid = vfork();
+ if (pid == 0) {
+ _exit(kExitTraceeExitCode);
+ }
+ TEST_PCHECK_MSG(pid > 0, "vfork failed");
+
+ int status;
+ TEST_PCHECK(wait4(pid, &status, 0, nullptr) > 0);
+ MaybeSave();
+ TEST_CHECK(WIFEXITED(status) && WEXITSTATUS(status) == kExitTraceeExitCode);
+ _exit(0);
+ }
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop.
+ int status;
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << " status " << status;
+
+ // Enable PTRACE_O_TRACEVFORK so we can get the ID of the grandchild,
+ // PTRACE_O_TRACEVFORKDONE so we can observe PTRACE_EVENT_VFORK_DONE, and
+ // PTRACE_O_TRACESYSGOOD so syscall-enter/exit-stops are unambiguously
+ // indicated by a stop signal of SIGTRAP|0x80 rather than just SIGTRAP.
+ ASSERT_THAT(ptrace(PTRACE_SETOPTIONS, child_pid, 0,
+ PTRACE_O_TRACEVFORK | PTRACE_O_TRACEVFORKDONE |
+ PTRACE_O_TRACESYSGOOD),
+ SyscallSucceeds());
+
+ // Suppress the SIGSTOP and wait for the child to report PTRACE_EVENT_VFORK.
+ // Get the new process' ID from the event.
+ ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_EQ(SIGTRAP | (PTRACE_EVENT_VFORK << 8), status >> 8);
+ unsigned long eventmsg;
+ ASSERT_THAT(ptrace(PTRACE_GETEVENTMSG, child_pid, 0, &eventmsg),
+ SyscallSucceeds());
+ pid_t const grandchild_pid = eventmsg;
+
+ // The grandchild should be traced by us and in signal-delivery-stop by
+ // SIGSTOP due to PTRACE_O_TRACEVFORK. This allows us to wait on it even
+ // though we're not its parent.
+ ASSERT_THAT(waitpid(grandchild_pid, &status, 0),
+ SyscallSucceedsWithValue(grandchild_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << " status " << status;
+
+ // Resume the child with PTRACE_SYSCALL. Since the grandchild is still in
+ // signal-delivery-stop, the child should remain in vfork() waiting for the
+ // grandchild to exec or exit.
+ ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds());
+ absl::SleepFor(absl::Seconds(1));
+ ASSERT_THAT(waitpid(child_pid, &status, WNOHANG),
+ SyscallSucceedsWithValue(0));
+
+ // Suppress the grandchild's SIGSTOP and wait for the grandchild to exit. Pass
+ // WNOWAIT to waitid() so that we don't acknowledge the grandchild's exit yet.
+ ASSERT_THAT(ptrace(PTRACE_CONT, grandchild_pid, 0, 0), SyscallSucceeds());
+ siginfo_t siginfo = {};
+ ASSERT_THAT(waitid(P_PID, grandchild_pid, &siginfo, WEXITED | WNOWAIT),
+ SyscallSucceeds());
+ EXPECT_EQ(SIGCHLD, siginfo.si_signo);
+ EXPECT_EQ(CLD_EXITED, siginfo.si_code);
+ EXPECT_EQ(kExitTraceeExitCode, siginfo.si_status);
+ EXPECT_EQ(grandchild_pid, siginfo.si_pid);
+ EXPECT_EQ(getuid(), siginfo.si_uid);
+
+ // The child should now be in PTRACE_EVENT_VFORK_DONE stop. The event
+ // message should still be the grandchild's PID.
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_EQ(SIGTRAP | (PTRACE_EVENT_VFORK_DONE << 8), status >> 8);
+ ASSERT_THAT(ptrace(PTRACE_GETEVENTMSG, child_pid, 0, &eventmsg),
+ SyscallSucceeds());
+ EXPECT_EQ(grandchild_pid, eventmsg);
+
+ // Resume the child with PTRACE_SYSCALL again and expect it to enter
+ // syscall-exit-stop for vfork() or clone(), either of which should return the
+ // grandchild's PID from the syscall. Aside from PTRACE_O_TRACESYSGOOD,
+ // syscall-stops are distinguished from signal-delivery-stop by
+ // PTRACE_GETSIGINFO returning a siginfo for which si_code == SIGTRAP or
+ // SIGTRAP|0x80.
+ ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
+ << " status " << status;
+ ASSERT_THAT(ptrace(PTRACE_GETSIGINFO, child_pid, 0, &siginfo),
+ SyscallSucceeds());
+ EXPECT_TRUE(siginfo.si_code == SIGTRAP || siginfo.si_code == (SIGTRAP | 0x80))
+ << "si_code = " << siginfo.si_code;
+
+ {
+ struct user_regs_struct regs = {};
+ struct iovec iov;
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+ EXPECT_THAT(ptrace(PTRACE_GETREGSET, child_pid, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+#if defined(__x86_64__)
+ EXPECT_TRUE(regs.orig_rax == SYS_vfork || regs.orig_rax == SYS_clone)
+ << "orig_rax = " << regs.orig_rax;
+ EXPECT_EQ(grandchild_pid, regs.rax);
+#elif defined(__aarch64__)
+ EXPECT_TRUE(regs.regs[8] == SYS_clone) << "regs[8] = " << regs.regs[8];
+ EXPECT_EQ(grandchild_pid, regs.regs[0]);
+#endif // defined(__x86_64__)
+ }
+
+ // After this point, the child will be making wait4 syscalls that will be
+ // interrupted by saving, so saving is not permitted. Note that this is
+ // explicitly released below once the grandchild exits.
+ DisableSave ds;
+
+ // Resume the child with PTRACE_SYSCALL again and expect it to enter
+ // syscall-enter-stop for wait4().
+ ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
+ << " status " << status;
+ ASSERT_THAT(ptrace(PTRACE_GETSIGINFO, child_pid, 0, &siginfo),
+ SyscallSucceeds());
+ EXPECT_TRUE(siginfo.si_code == SIGTRAP || siginfo.si_code == (SIGTRAP | 0x80))
+ << "si_code = " << siginfo.si_code;
+#ifdef __x86_64__
+ {
+ EXPECT_THAT(ptrace(PTRACE_PEEKUSER, child_pid,
+ offsetof(struct user_regs_struct, orig_rax), 0),
+ SyscallSucceedsWithValue(SYS_wait4));
+ }
+#endif // defined(__x86_64__)
+
+ // Resume the child with PTRACE_SYSCALL again. Since the grandchild is
+ // waiting for the tracer (us) to acknowledge its exit first, wait4 should
+ // block.
+ ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds());
+ absl::SleepFor(absl::Seconds(1));
+ ASSERT_THAT(waitpid(child_pid, &status, WNOHANG),
+ SyscallSucceedsWithValue(0));
+
+ // Acknowledge the grandchild's exit.
+ ASSERT_THAT(waitpid(grandchild_pid, &status, 0),
+ SyscallSucceedsWithValue(grandchild_pid));
+ ds.reset();
+
+ // Now the child should enter syscall-exit-stop for wait4, returning with the
+ // grandchild's PID.
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
+ << " status " << status;
+ {
+ struct user_regs_struct regs = {};
+ struct iovec iov;
+ iov.iov_base = &regs;
+ iov.iov_len = sizeof(regs);
+ EXPECT_THAT(ptrace(PTRACE_GETREGSET, child_pid, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+#if defined(__x86_64__)
+ EXPECT_EQ(SYS_wait4, regs.orig_rax);
+ EXPECT_EQ(grandchild_pid, regs.rax);
+#elif defined(__aarch64__)
+ EXPECT_EQ(SYS_wait4, regs.regs[8]);
+ EXPECT_EQ(grandchild_pid, regs.regs[0]);
+#endif // defined(__x86_64__)
+ }
+
+ // Detach from the child and wait for it to exit.
+ ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << " status " << status;
+}
+
+// These tests requires knowledge of architecture-specific syscall convention.
+#ifdef __x86_64__
+TEST(PtraceTest, Int3) {
+ SKIP_IF(PlatformSupportInt3() == PlatformSupport::NotSupported);
+
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+
+ // Enable tracing.
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+
+ // Interrupt 3 - trap to debugger
+ asm("int3");
+
+ _exit(56);
+ }
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
+ << " status " << status;
+
+ ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds());
+
+ // The child should validate the injected return value and then exit normally.
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 56)
+ << " status " << status;
+}
+
+TEST(PtraceTest, Sysemu_PokeUser) {
+ constexpr int kSysemuHelperFirstExitCode = 126;
+ constexpr uint64_t kSysemuInjectedExitGroupReturn = 42;
+
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+
+ // Enable tracing, then raise SIGSTOP and expect our parent to suppress it.
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+ RaiseSignal(SIGSTOP);
+
+ // Try to exit_group, expecting the tracer to skip the syscall and set its
+ // own return value.
+ int const rv = syscall(SYS_exit_group, kSysemuHelperFirstExitCode);
+ TEST_PCHECK_MSG(rv == kSysemuInjectedExitGroupReturn,
+ "exit_group returned incorrect value");
+
+ _exit(0);
+ }
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop.
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << " status " << status;
+
+ // Suppress the SIGSTOP and wait for the child to enter syscall-enter-stop
+ // for its first exit_group syscall.
+ ASSERT_THAT(ptrace(kPtraceSysemu, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
+ << " status " << status;
+
+ struct user_regs_struct regs = {};
+ ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, &regs), SyscallSucceeds());
+ EXPECT_EQ(SYS_exit_group, regs.orig_rax);
+ EXPECT_EQ(-ENOSYS, regs.rax);
+ EXPECT_EQ(kSysemuHelperFirstExitCode, regs.rdi);
+
+ // Replace the exit_group return value, then resume the child, which should
+ // automatically skip the syscall.
+ ASSERT_THAT(
+ ptrace(PTRACE_POKEUSER, child_pid, offsetof(struct user_regs_struct, rax),
+ kSysemuInjectedExitGroupReturn),
+ SyscallSucceeds());
+ ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds());
+
+ // The child should validate the injected return value and then exit normally.
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << " status " << status;
+}
+
+// This test also cares about syscall-exit-stop.
+TEST(PtraceTest, ERESTART_NoRandomSave) {
+ constexpr int kSigno = SIGUSR1;
+
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+
+ // Ignore, but unblock, kSigno.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_IGN;
+ TEST_PCHECK(sigfillset(&sa.sa_mask) == 0);
+ TEST_PCHECK(sigaction(kSigno, &sa, nullptr) == 0);
+ MaybeSave();
+ TEST_PCHECK(sigprocmask(SIG_UNBLOCK, &sa.sa_mask, nullptr) == 0);
+ MaybeSave();
+
+ // Enable tracing, then raise SIGSTOP and expect our parent to suppress it.
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+ RaiseSignal(SIGSTOP);
+
+ // Invoke the pause syscall, which normally should not return until we
+ // receive a signal that "either terminates the process or causes the
+ // invocation of a signal-catching function".
+ pause();
+
+ _exit(0);
+ }
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop.
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << " status " << status;
+
+ // After this point, the child's pause syscall will be interrupted by saving,
+ // so saving is not permitted. Note that this is explicitly released below
+ // once the child is stopped.
+ DisableSave ds;
+
+ // Suppress the SIGSTOP and wait for the child to enter syscall-enter-stop for
+ // its pause syscall.
+ ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
+ << " status " << status;
+
+ struct user_regs_struct regs = {};
+ ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, &regs), SyscallSucceeds());
+ EXPECT_EQ(SYS_pause, regs.orig_rax);
+ EXPECT_EQ(-ENOSYS, regs.rax);
+
+ // Resume the child with PTRACE_SYSCALL and expect it to block in the pause
+ // syscall.
+ ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds());
+ absl::SleepFor(absl::Seconds(1));
+ ASSERT_THAT(waitpid(child_pid, &status, WNOHANG),
+ SyscallSucceedsWithValue(0));
+
+ // Send the child kSigno, causing it to return ERESTARTNOHAND and enter
+ // syscall-exit-stop from the pause syscall.
+ constexpr int ERESTARTNOHAND = 514;
+ ASSERT_THAT(kill(child_pid, kSigno), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP)
+ << " status " << status;
+ ds.reset();
+
+ ASSERT_THAT(ptrace(PTRACE_GETREGS, child_pid, 0, &regs), SyscallSucceeds());
+ EXPECT_EQ(SYS_pause, regs.orig_rax);
+ EXPECT_EQ(-ERESTARTNOHAND, regs.rax);
+
+ // Replace the return value from pause with 0, causing pause to not be
+ // restarted despite kSigno being ignored.
+ ASSERT_THAT(ptrace(PTRACE_POKEUSER, child_pid,
+ offsetof(struct user_regs_struct, rax), 0),
+ SyscallSucceeds());
+
+ // Detach from the child and wait for it to exit.
+ ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << " status " << status;
+}
+#endif // defined(__x86_64__)
+
+TEST(PtraceTest, Seize_Interrupt_Listen) {
+ volatile long child_should_spin = 1;
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+ while (child_should_spin) {
+ SleepSafe(absl::Seconds(1));
+ }
+ _exit(1);
+ }
+
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Attach to the child with PTRACE_SEIZE; doing so should not stop the child.
+ ASSERT_THAT(ptrace(PTRACE_SEIZE, child_pid, 0, 0), SyscallSucceeds());
+ int status;
+ EXPECT_THAT(waitpid(child_pid, &status, WNOHANG),
+ SyscallSucceedsWithValue(0));
+
+ // Stop the child with PTRACE_INTERRUPT.
+ ASSERT_THAT(ptrace(PTRACE_INTERRUPT, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_EQ(SIGTRAP | (kPtraceEventStop << 8), status >> 8);
+
+ // Unset child_should_spin to verify that the child never leaves the spin
+ // loop.
+ ASSERT_THAT(ptrace(PTRACE_POKEDATA, child_pid, &child_should_spin, 0),
+ SyscallSucceeds());
+
+ // Send SIGSTOP to the child, then resume it, allowing it to proceed to
+ // signal-delivery-stop.
+ ASSERT_THAT(kill(child_pid, SIGSTOP), SyscallSucceeds());
+ ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << " status " << status;
+
+ // Release the child from signal-delivery-stop without suppressing the
+ // SIGSTOP, causing it to enter group-stop.
+ ASSERT_THAT(ptrace(PTRACE_CONT, child_pid, 0, SIGSTOP), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_EQ(SIGSTOP | (kPtraceEventStop << 8), status >> 8);
+
+ // "The state of the tracee after PTRACE_LISTEN is somewhat of a gray area: it
+ // is not in any ptrace-stop (ptrace commands won't work on it, and it will
+ // deliver waitpid(2) notifications), but it also may be considered 'stopped'
+ // because it is not executing instructions (is not scheduled), and if it was
+ // in group-stop before PTRACE_LISTEN, it will not respond to signals until
+ // SIGCONT is received." - ptrace(2).
+ ASSERT_THAT(ptrace(PTRACE_LISTEN, child_pid, 0, 0), SyscallSucceeds());
+ EXPECT_THAT(ptrace(PTRACE_CONT, child_pid, 0, 0),
+ SyscallFailsWithErrno(ESRCH));
+ EXPECT_THAT(waitpid(child_pid, &status, WNOHANG),
+ SyscallSucceedsWithValue(0));
+ EXPECT_THAT(kill(child_pid, SIGTERM), SyscallSucceeds());
+ absl::SleepFor(absl::Seconds(1));
+ EXPECT_THAT(waitpid(child_pid, &status, WNOHANG),
+ SyscallSucceedsWithValue(0));
+
+ // Send SIGCONT to the child, causing it to leave group-stop and re-trap due
+ // to PTRACE_LISTEN.
+ EXPECT_THAT(kill(child_pid, SIGCONT), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_EQ(SIGTRAP | (kPtraceEventStop << 8), status >> 8);
+
+ // Detach the child and expect it to exit due to the SIGTERM we sent while
+ // it was stopped by PTRACE_LISTEN.
+ ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGTERM)
+ << " status " << status;
+}
+
+TEST(PtraceTest, Interrupt_Listen_RequireSeize) {
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+ MaybeSave();
+ raise(SIGSTOP);
+ _exit(0);
+ }
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Wait for the child to send itself SIGSTOP and enter signal-delivery-stop.
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP)
+ << " status " << status;
+
+ // PTRACE_INTERRUPT and PTRACE_LISTEN should fail since the child wasn't
+ // attached with PTRACE_SEIZE, leaving the child in signal-delivery-stop.
+ EXPECT_THAT(ptrace(PTRACE_INTERRUPT, child_pid, 0, 0),
+ SyscallFailsWithErrno(EIO));
+ EXPECT_THAT(ptrace(PTRACE_LISTEN, child_pid, 0, 0),
+ SyscallFailsWithErrno(EIO));
+
+ // Suppress SIGSTOP and detach from the child, expecting it to exit normally.
+ ASSERT_THAT(ptrace(PTRACE_DETACH, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << " status " << status;
+}
+
+TEST(PtraceTest, SeizeSetOptions) {
+ pid_t const child_pid = fork();
+ if (child_pid == 0) {
+ // In child process.
+ while (true) {
+ SleepSafe(absl::Seconds(1));
+ }
+ }
+
+ // In parent process.
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ // Attach to the child with PTRACE_SEIZE while setting PTRACE_O_TRACESYSGOOD.
+ ASSERT_THAT(ptrace(PTRACE_SEIZE, child_pid, 0, PTRACE_O_TRACESYSGOOD),
+ SyscallSucceeds());
+
+ // Stop the child with PTRACE_INTERRUPT.
+ ASSERT_THAT(ptrace(PTRACE_INTERRUPT, child_pid, 0, 0), SyscallSucceeds());
+ int status;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_EQ(SIGTRAP | (kPtraceEventStop << 8), status >> 8);
+
+ // Resume the child with PTRACE_SYSCALL and wait for it to enter
+ // syscall-enter-stop. The stop signal status from the syscall stop should be
+ // SIGTRAP|0x80, reflecting PTRACE_O_TRACESYSGOOD.
+ ASSERT_THAT(ptrace(PTRACE_SYSCALL, child_pid, 0, 0), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
+ << " status " << status;
+
+ // Clean up the child.
+ ASSERT_THAT(kill(child_pid, SIGKILL), SyscallSucceeds());
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ if (WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80)) {
+ // "SIGKILL kills even within system calls (syscall-exit-stop is not
+ // generated prior to death by SIGKILL). The net effect is that SIGKILL
+ // always kills the process (all its threads), even if some threads of the
+ // process are ptraced." - ptrace(2). This is technically true, but...
+ //
+ // When we send SIGKILL to the child, kernel/signal.c:complete_signal() =>
+ // signal_wake_up(resume=1) kicks the tracee out of the syscall-enter-stop.
+ // The pending SIGKILL causes the syscall to be skipped, but the child
+ // thread still reports syscall-exit before checking for pending signals; in
+ // current kernels, this is
+ // arch/x86/entry/common.c:syscall_return_slowpath() =>
+ // syscall_slow_exit_work() =>
+ // include/linux/tracehook.h:tracehook_report_syscall_exit() =>
+ // ptrace_report_syscall() => kernel/signal.c:ptrace_notify() =>
+ // ptrace_do_notify() => ptrace_stop().
+ //
+ // ptrace_stop() sets the task's state to TASK_TRACED and the task's
+ // exit_code to SIGTRAP|0x80 (passed by ptrace_report_syscall()), then calls
+ // freezable_schedule(). freezable_schedule() eventually reaches
+ // __schedule(), which detects signal_pending_state() due to the pending
+ // SIGKILL, sets the task's state back to TASK_RUNNING, and returns without
+ // descheduling. Thus, the task never enters syscall-exit-stop. However, if
+ // our wait4() => kernel/exit.c:wait_task_stopped() racily observes the
+ // TASK_TRACED state and the non-zero exit code set by ptrace_stop() before
+ // __schedule() sets the state back to TASK_RUNNING, it will return the
+ // task's exit_code as status W_STOPCODE(SIGTRAP|0x80). So we get a spurious
+ // syscall-exit-stop notification, and need to wait4() again for task exit.
+ //
+ // gVisor is not susceptible to this race because
+ // kernel.Task.waitCollectTraceeStopLocked() checks specifically for an
+ // active ptraceStop, which is not initiated if SIGKILL is pending.
+ std::cout << "Observed syscall-exit after SIGKILL" << std::endl;
+ ASSERT_THAT(waitpid(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ }
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL)
+ << " status " << status;
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ gvisor::testing::TestInit(&argc, &argv);
+
+ if (absl::GetFlag(FLAGS_ptrace_test_execve_child)) {
+ gvisor::testing::RunExecveChild();
+ }
+
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/pty.cc b/test/syscalls/linux/pty.cc
new file mode 100644
index 000000000..f9392b9e0
--- /dev/null
+++ b/test/syscalls/linux/pty.cc
@@ -0,0 +1,1627 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <linux/capability.h>
+#include <linux/major.h>
+#include <poll.h>
+#include <sched.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <termios.h>
+#include <unistd.h>
+
+#include <iostream>
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "absl/strings/str_cat.h"
+#include "absl/synchronization/notification.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/pty_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using ::testing::AnyOf;
+using ::testing::Contains;
+using ::testing::Eq;
+using ::testing::Not;
+
+// Tests Unix98 pseudoterminals.
+//
+// These tests assume that /dev/ptmx exists and is associated with a devpts
+// filesystem mounted at /dev/pts/. While a Linux distribution could
+// theoretically place those anywhere, glibc expects those locations, so they
+// are effectively fixed.
+
+// Minor device number for an unopened ptmx file.
+constexpr int kPtmxMinor = 2;
+
+// The timeout when polling for data from a pty. When data is written to one end
+// of a pty, Linux asynchronously makes it available to the other end, so we
+// have to wait.
+constexpr absl::Duration kTimeout = absl::Seconds(20);
+
+// The maximum line size in bytes returned per read from a pty file.
+constexpr int kMaxLineSize = 4096;
+
+constexpr char kMasterPath[] = "/dev/ptmx";
+
+// glibc defines its own, different, version of struct termios. We care about
+// what the kernel does, not glibc.
+#define KERNEL_NCCS 19
+struct kernel_termios {
+ tcflag_t c_iflag;
+ tcflag_t c_oflag;
+ tcflag_t c_cflag;
+ tcflag_t c_lflag;
+ cc_t c_line;
+ cc_t c_cc[KERNEL_NCCS];
+};
+
+bool operator==(struct kernel_termios const& a,
+ struct kernel_termios const& b) {
+ return memcmp(&a, &b, sizeof(a)) == 0;
+}
+
+// Returns the termios-style control character for the passed character.
+//
+// e.g., for Ctrl-C, i.e., ^C, call ControlCharacter('C').
+//
+// Standard control characters are ASCII bytes 0 through 31.
+constexpr char ControlCharacter(char c) {
+ // A is 1, B is 2, etc.
+ return c - 'A' + 1;
+}
+
+// Returns the printable character the given control character represents.
+constexpr char FromControlCharacter(char c) { return c + 'A' - 1; }
+
+// Returns true if c is a control character.
+//
+// Standard control characters are ASCII bytes 0 through 31.
+constexpr bool IsControlCharacter(char c) { return c <= 31; }
+
+struct Field {
+ const char* name;
+ uint64_t mask;
+ uint64_t value;
+};
+
+// ParseFields returns a string representation of value, using the names in
+// fields.
+std::string ParseFields(const Field* fields, size_t len, uint64_t value) {
+ bool first = true;
+ std::string s;
+ for (size_t i = 0; i < len; i++) {
+ const Field f = fields[i];
+ if ((value & f.mask) == f.value) {
+ if (!first) {
+ s += "|";
+ }
+ s += f.name;
+ first = false;
+ value &= ~f.mask;
+ }
+ }
+
+ if (value) {
+ if (!first) {
+ s += "|";
+ }
+ absl::StrAppend(&s, value);
+ }
+
+ return s;
+}
+
+const Field kIflagFields[] = {
+ {"IGNBRK", IGNBRK, IGNBRK}, {"BRKINT", BRKINT, BRKINT},
+ {"IGNPAR", IGNPAR, IGNPAR}, {"PARMRK", PARMRK, PARMRK},
+ {"INPCK", INPCK, INPCK}, {"ISTRIP", ISTRIP, ISTRIP},
+ {"INLCR", INLCR, INLCR}, {"IGNCR", IGNCR, IGNCR},
+ {"ICRNL", ICRNL, ICRNL}, {"IUCLC", IUCLC, IUCLC},
+ {"IXON", IXON, IXON}, {"IXANY", IXANY, IXANY},
+ {"IXOFF", IXOFF, IXOFF}, {"IMAXBEL", IMAXBEL, IMAXBEL},
+ {"IUTF8", IUTF8, IUTF8},
+};
+
+const Field kOflagFields[] = {
+ {"OPOST", OPOST, OPOST}, {"OLCUC", OLCUC, OLCUC},
+ {"ONLCR", ONLCR, ONLCR}, {"OCRNL", OCRNL, OCRNL},
+ {"ONOCR", ONOCR, ONOCR}, {"ONLRET", ONLRET, ONLRET},
+ {"OFILL", OFILL, OFILL}, {"OFDEL", OFDEL, OFDEL},
+ {"NL0", NLDLY, NL0}, {"NL1", NLDLY, NL1},
+ {"CR0", CRDLY, CR0}, {"CR1", CRDLY, CR1},
+ {"CR2", CRDLY, CR2}, {"CR3", CRDLY, CR3},
+ {"TAB0", TABDLY, TAB0}, {"TAB1", TABDLY, TAB1},
+ {"TAB2", TABDLY, TAB2}, {"TAB3", TABDLY, TAB3},
+ {"BS0", BSDLY, BS0}, {"BS1", BSDLY, BS1},
+ {"FF0", FFDLY, FF0}, {"FF1", FFDLY, FF1},
+ {"VT0", VTDLY, VT0}, {"VT1", VTDLY, VT1},
+ {"XTABS", XTABS, XTABS},
+};
+
+#ifndef IBSHIFT
+// Shift from CBAUD to CIBAUD.
+#define IBSHIFT 16
+#endif
+
+const Field kCflagFields[] = {
+ {"B0", CBAUD, B0},
+ {"B50", CBAUD, B50},
+ {"B75", CBAUD, B75},
+ {"B110", CBAUD, B110},
+ {"B134", CBAUD, B134},
+ {"B150", CBAUD, B150},
+ {"B200", CBAUD, B200},
+ {"B300", CBAUD, B300},
+ {"B600", CBAUD, B600},
+ {"B1200", CBAUD, B1200},
+ {"B1800", CBAUD, B1800},
+ {"B2400", CBAUD, B2400},
+ {"B4800", CBAUD, B4800},
+ {"B9600", CBAUD, B9600},
+ {"B19200", CBAUD, B19200},
+ {"B38400", CBAUD, B38400},
+ {"CS5", CSIZE, CS5},
+ {"CS6", CSIZE, CS6},
+ {"CS7", CSIZE, CS7},
+ {"CS8", CSIZE, CS8},
+ {"CSTOPB", CSTOPB, CSTOPB},
+ {"CREAD", CREAD, CREAD},
+ {"PARENB", PARENB, PARENB},
+ {"PARODD", PARODD, PARODD},
+ {"HUPCL", HUPCL, HUPCL},
+ {"CLOCAL", CLOCAL, CLOCAL},
+ {"B57600", CBAUD, B57600},
+ {"B115200", CBAUD, B115200},
+ {"B230400", CBAUD, B230400},
+ {"B460800", CBAUD, B460800},
+ {"B500000", CBAUD, B500000},
+ {"B576000", CBAUD, B576000},
+ {"B921600", CBAUD, B921600},
+ {"B1000000", CBAUD, B1000000},
+ {"B1152000", CBAUD, B1152000},
+ {"B1500000", CBAUD, B1500000},
+ {"B2000000", CBAUD, B2000000},
+ {"B2500000", CBAUD, B2500000},
+ {"B3000000", CBAUD, B3000000},
+ {"B3500000", CBAUD, B3500000},
+ {"B4000000", CBAUD, B4000000},
+ {"CMSPAR", CMSPAR, CMSPAR},
+ {"CRTSCTS", CRTSCTS, CRTSCTS},
+ {"IB0", CIBAUD, B0 << IBSHIFT},
+ {"IB50", CIBAUD, B50 << IBSHIFT},
+ {"IB75", CIBAUD, B75 << IBSHIFT},
+ {"IB110", CIBAUD, B110 << IBSHIFT},
+ {"IB134", CIBAUD, B134 << IBSHIFT},
+ {"IB150", CIBAUD, B150 << IBSHIFT},
+ {"IB200", CIBAUD, B200 << IBSHIFT},
+ {"IB300", CIBAUD, B300 << IBSHIFT},
+ {"IB600", CIBAUD, B600 << IBSHIFT},
+ {"IB1200", CIBAUD, B1200 << IBSHIFT},
+ {"IB1800", CIBAUD, B1800 << IBSHIFT},
+ {"IB2400", CIBAUD, B2400 << IBSHIFT},
+ {"IB4800", CIBAUD, B4800 << IBSHIFT},
+ {"IB9600", CIBAUD, B9600 << IBSHIFT},
+ {"IB19200", CIBAUD, B19200 << IBSHIFT},
+ {"IB38400", CIBAUD, B38400 << IBSHIFT},
+ {"IB57600", CIBAUD, B57600 << IBSHIFT},
+ {"IB115200", CIBAUD, B115200 << IBSHIFT},
+ {"IB230400", CIBAUD, B230400 << IBSHIFT},
+ {"IB460800", CIBAUD, B460800 << IBSHIFT},
+ {"IB500000", CIBAUD, B500000 << IBSHIFT},
+ {"IB576000", CIBAUD, B576000 << IBSHIFT},
+ {"IB921600", CIBAUD, B921600 << IBSHIFT},
+ {"IB1000000", CIBAUD, B1000000 << IBSHIFT},
+ {"IB1152000", CIBAUD, B1152000 << IBSHIFT},
+ {"IB1500000", CIBAUD, B1500000 << IBSHIFT},
+ {"IB2000000", CIBAUD, B2000000 << IBSHIFT},
+ {"IB2500000", CIBAUD, B2500000 << IBSHIFT},
+ {"IB3000000", CIBAUD, B3000000 << IBSHIFT},
+ {"IB3500000", CIBAUD, B3500000 << IBSHIFT},
+ {"IB4000000", CIBAUD, B4000000 << IBSHIFT},
+};
+
+const Field kLflagFields[] = {
+ {"ISIG", ISIG, ISIG}, {"ICANON", ICANON, ICANON},
+ {"XCASE", XCASE, XCASE}, {"ECHO", ECHO, ECHO},
+ {"ECHOE", ECHOE, ECHOE}, {"ECHOK", ECHOK, ECHOK},
+ {"ECHONL", ECHONL, ECHONL}, {"NOFLSH", NOFLSH, NOFLSH},
+ {"TOSTOP", TOSTOP, TOSTOP}, {"ECHOCTL", ECHOCTL, ECHOCTL},
+ {"ECHOPRT", ECHOPRT, ECHOPRT}, {"ECHOKE", ECHOKE, ECHOKE},
+ {"FLUSHO", FLUSHO, FLUSHO}, {"PENDIN", PENDIN, PENDIN},
+ {"IEXTEN", IEXTEN, IEXTEN}, {"EXTPROC", EXTPROC, EXTPROC},
+};
+
+std::string FormatCC(char c) {
+ if (isgraph(c)) {
+ return std::string(1, c);
+ } else if (c == ' ') {
+ return " ";
+ } else if (c == '\t') {
+ return "\\t";
+ } else if (c == '\r') {
+ return "\\r";
+ } else if (c == '\n') {
+ return "\\n";
+ } else if (c == '\0') {
+ return "\\0";
+ } else if (IsControlCharacter(c)) {
+ return absl::StrCat("^", std::string(1, FromControlCharacter(c)));
+ }
+ return absl::StrCat("\\x", absl::Hex(c));
+}
+
+std::ostream& operator<<(std::ostream& os, struct kernel_termios const& a) {
+ os << "{ c_iflag = "
+ << ParseFields(kIflagFields, ABSL_ARRAYSIZE(kIflagFields), a.c_iflag);
+ os << ", c_oflag = "
+ << ParseFields(kOflagFields, ABSL_ARRAYSIZE(kOflagFields), a.c_oflag);
+ os << ", c_cflag = "
+ << ParseFields(kCflagFields, ABSL_ARRAYSIZE(kCflagFields), a.c_cflag);
+ os << ", c_lflag = "
+ << ParseFields(kLflagFields, ABSL_ARRAYSIZE(kLflagFields), a.c_lflag);
+ os << ", c_line = " << a.c_line;
+ os << ", c_cc = { [VINTR] = '" << FormatCC(a.c_cc[VINTR]);
+ os << "', [VQUIT] = '" << FormatCC(a.c_cc[VQUIT]);
+ os << "', [VERASE] = '" << FormatCC(a.c_cc[VERASE]);
+ os << "', [VKILL] = '" << FormatCC(a.c_cc[VKILL]);
+ os << "', [VEOF] = '" << FormatCC(a.c_cc[VEOF]);
+ os << "', [VTIME] = '" << static_cast<int>(a.c_cc[VTIME]);
+ os << "', [VMIN] = " << static_cast<int>(a.c_cc[VMIN]);
+ os << ", [VSWTC] = '" << FormatCC(a.c_cc[VSWTC]);
+ os << "', [VSTART] = '" << FormatCC(a.c_cc[VSTART]);
+ os << "', [VSTOP] = '" << FormatCC(a.c_cc[VSTOP]);
+ os << "', [VSUSP] = '" << FormatCC(a.c_cc[VSUSP]);
+ os << "', [VEOL] = '" << FormatCC(a.c_cc[VEOL]);
+ os << "', [VREPRINT] = '" << FormatCC(a.c_cc[VREPRINT]);
+ os << "', [VDISCARD] = '" << FormatCC(a.c_cc[VDISCARD]);
+ os << "', [VWERASE] = '" << FormatCC(a.c_cc[VWERASE]);
+ os << "', [VLNEXT] = '" << FormatCC(a.c_cc[VLNEXT]);
+ os << "', [VEOL2] = '" << FormatCC(a.c_cc[VEOL2]);
+ os << "'}";
+ return os;
+}
+
+// Return the default termios settings for a new terminal.
+struct kernel_termios DefaultTermios() {
+ struct kernel_termios t = {};
+ t.c_iflag = IXON | ICRNL;
+ t.c_oflag = OPOST | ONLCR;
+ t.c_cflag = B38400 | CSIZE | CS8 | CREAD;
+ t.c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK | ECHOCTL | ECHOKE | IEXTEN;
+ t.c_line = 0;
+ t.c_cc[VINTR] = ControlCharacter('C');
+ t.c_cc[VQUIT] = ControlCharacter('\\');
+ t.c_cc[VERASE] = '\x7f';
+ t.c_cc[VKILL] = ControlCharacter('U');
+ t.c_cc[VEOF] = ControlCharacter('D');
+ t.c_cc[VTIME] = '\0';
+ t.c_cc[VMIN] = 1;
+ t.c_cc[VSWTC] = '\0';
+ t.c_cc[VSTART] = ControlCharacter('Q');
+ t.c_cc[VSTOP] = ControlCharacter('S');
+ t.c_cc[VSUSP] = ControlCharacter('Z');
+ t.c_cc[VEOL] = '\0';
+ t.c_cc[VREPRINT] = ControlCharacter('R');
+ t.c_cc[VDISCARD] = ControlCharacter('O');
+ t.c_cc[VWERASE] = ControlCharacter('W');
+ t.c_cc[VLNEXT] = ControlCharacter('V');
+ t.c_cc[VEOL2] = '\0';
+ return t;
+}
+
+// PollAndReadFd tries to read count bytes from buf within timeout.
+//
+// Returns a partial read if some bytes were read.
+//
+// fd must be non-blocking.
+PosixErrorOr<size_t> PollAndReadFd(int fd, void* buf, size_t count,
+ absl::Duration timeout) {
+ absl::Time end = absl::Now() + timeout;
+
+ size_t completed = 0;
+ absl::Duration remaining;
+ while ((remaining = end - absl::Now()) > absl::ZeroDuration()) {
+ struct pollfd pfd = {fd, POLLIN, 0};
+ int ret = RetryEINTR(poll)(&pfd, 1, absl::ToInt64Milliseconds(remaining));
+ if (ret < 0) {
+ return PosixError(errno, "poll failed");
+ } else if (ret == 0) {
+ // Timed out.
+ continue;
+ } else if (ret != 1) {
+ return PosixError(EINVAL, absl::StrCat("Bad poll ret ", ret));
+ }
+
+ ssize_t n =
+ ReadFd(fd, static_cast<char*>(buf) + completed, count - completed);
+ if (n < 0) {
+ if (errno == EAGAIN) {
+ // Linux sometimes returns EAGAIN from this read, despite the fact that
+ // poll returned success. Let's just do what do as we are told and try
+ // again.
+ continue;
+ }
+ return PosixError(errno, "read failed");
+ }
+ completed += n;
+ if (completed >= count) {
+ return completed;
+ }
+ }
+
+ if (completed) {
+ return completed;
+ }
+ return PosixError(ETIMEDOUT, "Poll timed out");
+}
+
+TEST(PtyTrunc, Truncate) {
+ // Opening PTYs with O_TRUNC shouldn't cause an error, but calls to
+ // (f)truncate should.
+ FileDescriptor master =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(kMasterPath, O_RDWR | O_TRUNC));
+ int n = ASSERT_NO_ERRNO_AND_VALUE(SlaveID(master));
+ std::string spath = absl::StrCat("/dev/pts/", n);
+ FileDescriptor slave =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(spath, O_RDWR | O_NONBLOCK | O_TRUNC));
+
+ EXPECT_THAT(truncate(kMasterPath, 0), SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(truncate(spath.c_str(), 0), SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(ftruncate(master.get(), 0), SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(ftruncate(slave.get(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(BasicPtyTest, StatUnopenedMaster) {
+ struct stat s;
+ ASSERT_THAT(stat(kMasterPath, &s), SyscallSucceeds());
+
+ EXPECT_EQ(s.st_rdev, makedev(TTYAUX_MAJOR, kPtmxMinor));
+ EXPECT_EQ(s.st_size, 0);
+ EXPECT_EQ(s.st_blocks, 0);
+
+ // ptmx attached to a specific devpts mount uses block size 1024. See
+ // fs/devpts/inode.c:devpts_fill_super.
+ //
+ // The global ptmx device uses the block size of the filesystem it is created
+ // on (which is usually 4096 for disk filesystems).
+ EXPECT_THAT(s.st_blksize, AnyOf(Eq(1024), Eq(4096)));
+}
+
+// Waits for count bytes to be readable from fd. Unlike poll, which can return
+// before all data is moved into a pty's read buffer, this function waits for
+// all count bytes to become readable.
+PosixErrorOr<int> WaitUntilReceived(int fd, int count) {
+ int buffered = -1;
+ absl::Duration remaining;
+ absl::Time end = absl::Now() + kTimeout;
+ while ((remaining = end - absl::Now()) > absl::ZeroDuration()) {
+ if (ioctl(fd, FIONREAD, &buffered) < 0) {
+ return PosixError(errno, "failed FIONREAD ioctl");
+ }
+ if (buffered >= count) {
+ return buffered;
+ }
+ absl::SleepFor(absl::Milliseconds(500));
+ }
+ return PosixError(
+ ETIMEDOUT,
+ absl::StrFormat(
+ "FIONREAD timed out, receiving only %d of %d expected bytes",
+ buffered, count));
+}
+
+// Verifies that there is nothing left to read from fd.
+void ExpectFinished(const FileDescriptor& fd) {
+ // Nothing more to read.
+ char c;
+ EXPECT_THAT(ReadFd(fd.get(), &c, 1), SyscallFailsWithErrno(EAGAIN));
+}
+
+// Verifies that we can read expected bytes from fd into buf.
+void ExpectReadable(const FileDescriptor& fd, int expected, char* buf) {
+ size_t n = ASSERT_NO_ERRNO_AND_VALUE(
+ PollAndReadFd(fd.get(), buf, expected, kTimeout));
+ EXPECT_EQ(expected, n);
+}
+
+TEST(BasicPtyTest, OpenMasterSlave) {
+ FileDescriptor master = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
+ FileDescriptor slave = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master));
+}
+
+// The slave entry in /dev/pts/ disappears when the master is closed, even if
+// the slave is still open.
+TEST(BasicPtyTest, SlaveEntryGoneAfterMasterClose) {
+ FileDescriptor master = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
+ FileDescriptor slave = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master));
+
+ // Get pty index.
+ int index = -1;
+ ASSERT_THAT(ioctl(master.get(), TIOCGPTN, &index), SyscallSucceeds());
+
+ std::string path = absl::StrCat("/dev/pts/", index);
+
+ struct stat st;
+ EXPECT_THAT(stat(path.c_str(), &st), SyscallSucceeds());
+
+ master.reset();
+
+ EXPECT_THAT(stat(path.c_str(), &st), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(BasicPtyTest, Getdents) {
+ FileDescriptor master1 = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
+ int index1 = -1;
+ ASSERT_THAT(ioctl(master1.get(), TIOCGPTN, &index1), SyscallSucceeds());
+ FileDescriptor slave1 = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master1));
+
+ FileDescriptor master2 = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR));
+ int index2 = -1;
+ ASSERT_THAT(ioctl(master2.get(), TIOCGPTN, &index2), SyscallSucceeds());
+ FileDescriptor slave2 = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master2));
+
+ // The directory contains ptmx, index1, and index2. (Plus any additional PTYs
+ // unrelated to this test.)
+
+ std::vector<std::string> contents =
+ ASSERT_NO_ERRNO_AND_VALUE(ListDir("/dev/pts/", true));
+ EXPECT_THAT(contents, Contains(absl::StrCat(index1)));
+ EXPECT_THAT(contents, Contains(absl::StrCat(index2)));
+
+ master2.reset();
+
+ // The directory contains ptmx and index1, but not index2 since the master is
+ // closed. (Plus any additional PTYs unrelated to this test.)
+
+ contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir("/dev/pts/", true));
+ EXPECT_THAT(contents, Contains(absl::StrCat(index1)));
+ EXPECT_THAT(contents, Not(Contains(absl::StrCat(index2))));
+
+ // N.B. devpts supports legacy "single-instance" mode and new "multi-instance"
+ // mode. In legacy mode, devpts does not contain a "ptmx" device (the distro
+ // must use mknod to create it somewhere, presumably /dev/ptmx).
+ // Multi-instance mode does include a "ptmx" device tied to that mount.
+ //
+ // We don't check for the presence or absence of "ptmx", as distros vary in
+ // their usage of the two modes.
+}
+
+class PtyTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ master_ = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
+ slave_ = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master_));
+ }
+
+ void DisableCanonical() {
+ struct kernel_termios t = {};
+ EXPECT_THAT(ioctl(slave_.get(), TCGETS, &t), SyscallSucceeds());
+ t.c_lflag &= ~ICANON;
+ EXPECT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+ }
+
+ void EnableCanonical() {
+ struct kernel_termios t = {};
+ EXPECT_THAT(ioctl(slave_.get(), TCGETS, &t), SyscallSucceeds());
+ t.c_lflag |= ICANON;
+ EXPECT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+ }
+
+ // Master and slave ends of the PTY. Non-blocking.
+ FileDescriptor master_;
+ FileDescriptor slave_;
+};
+
+// Master to slave sanity test.
+TEST_F(PtyTest, WriteMasterToSlave) {
+ // N.B. by default, the slave reads nothing until the master writes a newline.
+ constexpr char kBuf[] = "hello\n";
+
+ EXPECT_THAT(WriteFd(master_.get(), kBuf, sizeof(kBuf) - 1),
+ SyscallSucceedsWithValue(sizeof(kBuf) - 1));
+
+ // Linux moves data from the master to the slave via async work scheduled via
+ // tty_flip_buffer_push. Since it is asynchronous, the data may not be
+ // available for reading immediately. Instead we must poll and assert that it
+ // becomes available "soon".
+
+ char buf[sizeof(kBuf)] = {};
+ ExpectReadable(slave_, sizeof(buf) - 1, buf);
+
+ EXPECT_EQ(memcmp(buf, kBuf, sizeof(kBuf)), 0);
+}
+
+// Slave to master sanity test.
+TEST_F(PtyTest, WriteSlaveToMaster) {
+ // N.B. by default, the master reads nothing until the slave writes a newline,
+ // and the master gets a carriage return.
+ constexpr char kInput[] = "hello\n";
+ constexpr char kExpected[] = "hello\r\n";
+
+ EXPECT_THAT(WriteFd(slave_.get(), kInput, sizeof(kInput) - 1),
+ SyscallSucceedsWithValue(sizeof(kInput) - 1));
+
+ // Linux moves data from the master to the slave via async work scheduled via
+ // tty_flip_buffer_push. Since it is asynchronous, the data may not be
+ // available for reading immediately. Instead we must poll and assert that it
+ // becomes available "soon".
+
+ char buf[sizeof(kExpected)] = {};
+ ExpectReadable(master_, sizeof(buf) - 1, buf);
+
+ EXPECT_EQ(memcmp(buf, kExpected, sizeof(kExpected)), 0);
+}
+
+TEST_F(PtyTest, WriteInvalidUTF8) {
+ char c = 0xff;
+ ASSERT_THAT(syscall(__NR_write, master_.get(), &c, sizeof(c)),
+ SyscallSucceedsWithValue(sizeof(c)));
+}
+
+// Both the master and slave report the standard default termios settings.
+//
+// Note that TCGETS on the master actually redirects to the slave (see comment
+// on MasterTermiosUnchangable).
+TEST_F(PtyTest, DefaultTermios) {
+ struct kernel_termios t = {};
+ EXPECT_THAT(ioctl(slave_.get(), TCGETS, &t), SyscallSucceeds());
+ EXPECT_EQ(t, DefaultTermios());
+
+ EXPECT_THAT(ioctl(master_.get(), TCGETS, &t), SyscallSucceeds());
+ EXPECT_EQ(t, DefaultTermios());
+}
+
+// Changing termios from the master actually affects the slave.
+//
+// TCSETS on the master actually redirects to the slave (see comment on
+// MasterTermiosUnchangable).
+TEST_F(PtyTest, TermiosAffectsSlave) {
+ struct kernel_termios master_termios = {};
+ EXPECT_THAT(ioctl(master_.get(), TCGETS, &master_termios), SyscallSucceeds());
+ master_termios.c_lflag ^= ICANON;
+ EXPECT_THAT(ioctl(master_.get(), TCSETS, &master_termios), SyscallSucceeds());
+
+ struct kernel_termios slave_termios = {};
+ EXPECT_THAT(ioctl(slave_.get(), TCGETS, &slave_termios), SyscallSucceeds());
+ EXPECT_EQ(master_termios, slave_termios);
+}
+
+// The master end of the pty has termios:
+//
+// struct kernel_termios t = {
+// .c_iflag = 0;
+// .c_oflag = 0;
+// .c_cflag = B38400 | CS8 | CREAD;
+// .c_lflag = 0;
+// .c_cc = /* same as DefaultTermios */
+// }
+//
+// (From drivers/tty/pty.c:unix98_pty_init)
+//
+// All termios control ioctls on the master actually redirect to the slave
+// (drivers/tty/tty_ioctl.c:tty_mode_ioctl), making it impossible to change the
+// master termios.
+//
+// Verify this by setting ICRNL (which rewrites input \r to \n) and verify that
+// it has no effect on the master.
+TEST_F(PtyTest, MasterTermiosUnchangable) {
+ struct kernel_termios master_termios = {};
+ EXPECT_THAT(ioctl(master_.get(), TCGETS, &master_termios), SyscallSucceeds());
+ master_termios.c_lflag |= ICRNL;
+ EXPECT_THAT(ioctl(master_.get(), TCSETS, &master_termios), SyscallSucceeds());
+
+ char c = '\r';
+ ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1));
+
+ ExpectReadable(master_, 1, &c);
+ EXPECT_EQ(c, '\r'); // ICRNL had no effect!
+
+ ExpectFinished(master_);
+}
+
+// ICRNL rewrites input \r to \n.
+TEST_F(PtyTest, TermiosICRNL) {
+ struct kernel_termios t = DefaultTermios();
+ t.c_iflag |= ICRNL;
+ t.c_lflag &= ~ICANON; // for byte-by-byte reading.
+ ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+
+ char c = '\r';
+ ASSERT_THAT(WriteFd(master_.get(), &c, 1), SyscallSucceedsWithValue(1));
+
+ ExpectReadable(slave_, 1, &c);
+ EXPECT_EQ(c, '\n');
+
+ ExpectFinished(slave_);
+}
+
+// ONLCR rewrites output \n to \r\n.
+TEST_F(PtyTest, TermiosONLCR) {
+ struct kernel_termios t = DefaultTermios();
+ t.c_oflag |= ONLCR;
+ t.c_lflag &= ~ICANON; // for byte-by-byte reading.
+ ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+
+ char c = '\n';
+ ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1));
+
+ // Extra byte for NUL for EXPECT_STREQ.
+ char buf[3] = {};
+ ExpectReadable(master_, 2, buf);
+ EXPECT_STREQ(buf, "\r\n");
+
+ ExpectFinished(slave_);
+}
+
+TEST_F(PtyTest, TermiosIGNCR) {
+ struct kernel_termios t = DefaultTermios();
+ t.c_iflag |= IGNCR;
+ t.c_lflag &= ~ICANON; // for byte-by-byte reading.
+ ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+
+ char c = '\r';
+ ASSERT_THAT(WriteFd(master_.get(), &c, 1), SyscallSucceedsWithValue(1));
+
+ // Nothing to read.
+ ASSERT_THAT(PollAndReadFd(slave_.get(), &c, 1, kTimeout),
+ PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
+}
+
+// Test that we can successfully poll for readable data from the slave.
+TEST_F(PtyTest, TermiosPollSlave) {
+ struct kernel_termios t = DefaultTermios();
+ t.c_iflag |= IGNCR;
+ t.c_lflag &= ~ICANON; // for byte-by-byte reading.
+ ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+
+ absl::Notification notify;
+ int sfd = slave_.get();
+ ScopedThread th([sfd, &notify]() {
+ notify.Notify();
+
+ // Poll on the reader fd with POLLIN event.
+ struct pollfd poll_fd = {sfd, POLLIN, 0};
+ EXPECT_THAT(
+ RetryEINTR(poll)(&poll_fd, 1, absl::ToInt64Milliseconds(kTimeout)),
+ SyscallSucceedsWithValue(1));
+
+ // Should trigger POLLIN event.
+ EXPECT_EQ(poll_fd.revents & POLLIN, POLLIN);
+ });
+
+ notify.WaitForNotification();
+ // Sleep ensures that poll begins waiting before we write to the FD.
+ absl::SleepFor(absl::Seconds(1));
+
+ char s[] = "foo\n";
+ ASSERT_THAT(WriteFd(master_.get(), s, strlen(s) + 1), SyscallSucceeds());
+}
+
+// Test that we can successfully poll for readable data from the master.
+TEST_F(PtyTest, TermiosPollMaster) {
+ struct kernel_termios t = DefaultTermios();
+ t.c_iflag |= IGNCR;
+ t.c_lflag &= ~ICANON; // for byte-by-byte reading.
+ ASSERT_THAT(ioctl(master_.get(), TCSETS, &t), SyscallSucceeds());
+
+ absl::Notification notify;
+ int mfd = master_.get();
+ ScopedThread th([mfd, &notify]() {
+ notify.Notify();
+
+ // Poll on the reader fd with POLLIN event.
+ struct pollfd poll_fd = {mfd, POLLIN, 0};
+ EXPECT_THAT(
+ RetryEINTR(poll)(&poll_fd, 1, absl::ToInt64Milliseconds(kTimeout)),
+ SyscallSucceedsWithValue(1));
+
+ // Should trigger POLLIN event.
+ EXPECT_EQ(poll_fd.revents & POLLIN, POLLIN);
+ });
+
+ notify.WaitForNotification();
+ // Sleep ensures that poll begins waiting before we write to the FD.
+ absl::SleepFor(absl::Seconds(1));
+
+ char s[] = "foo\n";
+ ASSERT_THAT(WriteFd(slave_.get(), s, strlen(s) + 1), SyscallSucceeds());
+}
+
+TEST_F(PtyTest, TermiosINLCR) {
+ struct kernel_termios t = DefaultTermios();
+ t.c_iflag |= INLCR;
+ t.c_lflag &= ~ICANON; // for byte-by-byte reading.
+ ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+
+ char c = '\n';
+ ASSERT_THAT(WriteFd(master_.get(), &c, 1), SyscallSucceedsWithValue(1));
+
+ ExpectReadable(slave_, 1, &c);
+ EXPECT_EQ(c, '\r');
+
+ ExpectFinished(slave_);
+}
+
+TEST_F(PtyTest, TermiosONOCR) {
+ struct kernel_termios t = DefaultTermios();
+ t.c_oflag |= ONOCR;
+ t.c_lflag &= ~ICANON; // for byte-by-byte reading.
+ ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+
+ // The terminal is at column 0, so there should be no CR to read.
+ char c = '\r';
+ ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1));
+
+ // Nothing to read.
+ ASSERT_THAT(PollAndReadFd(master_.get(), &c, 1, kTimeout),
+ PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
+
+ // This time the column is greater than 0, so we should be able to read the CR
+ // out of the other end.
+ constexpr char kInput[] = "foo\r";
+ constexpr int kInputSize = sizeof(kInput) - 1;
+ ASSERT_THAT(WriteFd(slave_.get(), kInput, kInputSize),
+ SyscallSucceedsWithValue(kInputSize));
+
+ char buf[kInputSize] = {};
+ ExpectReadable(master_, kInputSize, buf);
+
+ EXPECT_EQ(memcmp(buf, kInput, kInputSize), 0);
+
+ ExpectFinished(master_);
+
+ // Terminal should be at column 0 again, so no CR can be read.
+ ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1));
+
+ // Nothing to read.
+ ASSERT_THAT(PollAndReadFd(master_.get(), &c, 1, kTimeout),
+ PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
+}
+
+TEST_F(PtyTest, TermiosOCRNL) {
+ struct kernel_termios t = DefaultTermios();
+ t.c_oflag |= OCRNL;
+ t.c_lflag &= ~ICANON; // for byte-by-byte reading.
+ ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+
+ // The terminal is at column 0, so there should be no CR to read.
+ char c = '\r';
+ ASSERT_THAT(WriteFd(slave_.get(), &c, 1), SyscallSucceedsWithValue(1));
+
+ ExpectReadable(master_, 1, &c);
+ EXPECT_EQ(c, '\n');
+
+ ExpectFinished(master_);
+}
+
+// Tests that VEOL is disabled when we start, and that we can set it to enable
+// it.
+TEST_F(PtyTest, VEOLTermination) {
+ // Write a few bytes ending with '\0', and confirm that we can't read.
+ constexpr char kInput[] = "hello";
+ ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput)),
+ SyscallSucceedsWithValue(sizeof(kInput)));
+ char buf[sizeof(kInput)] = {};
+ ASSERT_THAT(PollAndReadFd(slave_.get(), buf, sizeof(kInput), kTimeout),
+ PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
+
+ // Set the EOL character to '=' and write it.
+ constexpr char delim = '=';
+ struct kernel_termios t = DefaultTermios();
+ t.c_cc[VEOL] = delim;
+ ASSERT_THAT(ioctl(slave_.get(), TCSETS, &t), SyscallSucceeds());
+ ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
+
+ // Now we can read, as sending EOL caused the line to become available.
+ ExpectReadable(slave_, sizeof(kInput), buf);
+ EXPECT_EQ(memcmp(buf, kInput, sizeof(kInput)), 0);
+
+ ExpectReadable(slave_, 1, buf);
+ EXPECT_EQ(buf[0], '=');
+
+ ExpectFinished(slave_);
+}
+
+// Tests that we can write more than the 4096 character limit, then a
+// terminating character, then read out just the first 4095 bytes plus the
+// terminator.
+TEST_F(PtyTest, CanonBigWrite) {
+ constexpr int kWriteLen = kMaxLineSize + 4;
+ char input[kWriteLen];
+ memset(input, 'M', kWriteLen - 1);
+ input[kWriteLen - 1] = '\n';
+ ASSERT_THAT(WriteFd(master_.get(), input, kWriteLen),
+ SyscallSucceedsWithValue(kWriteLen));
+
+ // We can read the line.
+ char buf[kMaxLineSize] = {};
+ ExpectReadable(slave_, kMaxLineSize, buf);
+
+ ExpectFinished(slave_);
+}
+
+// Tests that data written in canonical mode can be read immediately once
+// switched to noncanonical mode.
+TEST_F(PtyTest, SwitchCanonToNoncanon) {
+ // Write a few bytes without a terminating character, switch to noncanonical
+ // mode, and read them.
+ constexpr char kInput[] = "hello";
+ ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput)),
+ SyscallSucceedsWithValue(sizeof(kInput)));
+
+ // Nothing available yet.
+ char buf[sizeof(kInput)] = {};
+ ASSERT_THAT(PollAndReadFd(slave_.get(), buf, sizeof(kInput), kTimeout),
+ PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
+
+ DisableCanonical();
+
+ ExpectReadable(slave_, sizeof(kInput), buf);
+ EXPECT_STREQ(buf, kInput);
+
+ ExpectFinished(slave_);
+}
+
+TEST_F(PtyTest, SwitchCanonToNonCanonNewline) {
+ // Write a few bytes with a terminating character.
+ constexpr char kInput[] = "hello\n";
+ ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput)),
+ SyscallSucceedsWithValue(sizeof(kInput)));
+
+ DisableCanonical();
+
+ // We can read the line.
+ char buf[sizeof(kInput)] = {};
+ ExpectReadable(slave_, sizeof(kInput), buf);
+ EXPECT_STREQ(buf, kInput);
+
+ ExpectFinished(slave_);
+}
+
+TEST_F(PtyTest, SwitchNoncanonToCanonNewlineBig) {
+ DisableCanonical();
+
+ // Write more than the maximum line size, then write a delimiter.
+ constexpr int kWriteLen = 4100;
+ char input[kWriteLen];
+ memset(input, 'M', kWriteLen);
+ ASSERT_THAT(WriteFd(master_.get(), input, kWriteLen),
+ SyscallSucceedsWithValue(kWriteLen));
+ // Wait for the input queue to fill.
+ ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), kMaxLineSize - 1));
+ constexpr char delim = '\n';
+ ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
+
+ EnableCanonical();
+
+ // We can read the line.
+ char buf[kMaxLineSize] = {};
+ ExpectReadable(slave_, kMaxLineSize - 1, buf);
+
+ // We can also read the remaining characters.
+ ExpectReadable(slave_, 6, buf);
+
+ ExpectFinished(slave_);
+}
+
+TEST_F(PtyTest, SwitchNoncanonToCanonNoNewline) {
+ DisableCanonical();
+
+ // Write a few bytes without a terminating character.
+ // mode, and read them.
+ constexpr char kInput[] = "hello";
+ ASSERT_THAT(WriteFd(master_.get(), kInput, sizeof(kInput) - 1),
+ SyscallSucceedsWithValue(sizeof(kInput) - 1));
+
+ ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), sizeof(kInput) - 1));
+ EnableCanonical();
+
+ // We can read the line.
+ char buf[sizeof(kInput)] = {};
+ ExpectReadable(slave_, sizeof(kInput) - 1, buf);
+ EXPECT_STREQ(buf, kInput);
+
+ ExpectFinished(slave_);
+}
+
+TEST_F(PtyTest, SwitchNoncanonToCanonNoNewlineBig) {
+ DisableCanonical();
+
+ // Write a few bytes without a terminating character.
+ // mode, and read them.
+ constexpr int kWriteLen = 4100;
+ char input[kWriteLen];
+ memset(input, 'M', kWriteLen);
+ ASSERT_THAT(WriteFd(master_.get(), input, kWriteLen),
+ SyscallSucceedsWithValue(kWriteLen));
+
+ ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), kMaxLineSize - 1));
+ EnableCanonical();
+
+ // We can read the line.
+ char buf[kMaxLineSize] = {};
+ ExpectReadable(slave_, kMaxLineSize - 1, buf);
+
+ ExpectFinished(slave_);
+}
+
+// Tests that we can write over the 4095 noncanonical limit, then read out
+// everything.
+TEST_F(PtyTest, NoncanonBigWrite) {
+ DisableCanonical();
+
+ // Write well over the 4095 internal buffer limit.
+ constexpr char kInput = 'M';
+ constexpr int kInputSize = kMaxLineSize * 2;
+ for (int i = 0; i < kInputSize; i++) {
+ // This makes too many syscalls for save/restore.
+ const DisableSave ds;
+ ASSERT_THAT(WriteFd(master_.get(), &kInput, sizeof(kInput)),
+ SyscallSucceedsWithValue(sizeof(kInput)));
+ }
+
+ // We should be able to read out everything. Sleep a bit so that Linux has a
+ // chance to move data from the master to the slave.
+ ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), kMaxLineSize - 1));
+ for (int i = 0; i < kInputSize; i++) {
+ // This makes too many syscalls for save/restore.
+ const DisableSave ds;
+ char c;
+ ExpectReadable(slave_, 1, &c);
+ ASSERT_EQ(c, kInput);
+ }
+
+ ExpectFinished(slave_);
+}
+
+// ICANON doesn't make input available until a line delimiter is typed.
+//
+// Test newline.
+TEST_F(PtyTest, TermiosICANONNewline) {
+ char input[3] = {'a', 'b', 'c'};
+ ASSERT_THAT(WriteFd(master_.get(), input, sizeof(input)),
+ SyscallSucceedsWithValue(sizeof(input)));
+
+ // Extra bytes for newline (written later) and NUL for EXPECT_STREQ.
+ char buf[5] = {};
+
+ // Nothing available yet.
+ ASSERT_THAT(PollAndReadFd(slave_.get(), buf, sizeof(input), kTimeout),
+ PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
+
+ char delim = '\n';
+ ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
+
+ // Now it is available.
+ ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), sizeof(input) + 1));
+ ExpectReadable(slave_, sizeof(input) + 1, buf);
+ EXPECT_STREQ(buf, "abc\n");
+
+ ExpectFinished(slave_);
+}
+
+// ICANON doesn't make input available until a line delimiter is typed.
+//
+// Test EOF (^D).
+TEST_F(PtyTest, TermiosICANONEOF) {
+ char input[3] = {'a', 'b', 'c'};
+ ASSERT_THAT(WriteFd(master_.get(), input, sizeof(input)),
+ SyscallSucceedsWithValue(sizeof(input)));
+
+ // Extra byte for NUL for EXPECT_STREQ.
+ char buf[4] = {};
+
+ // Nothing available yet.
+ ASSERT_THAT(PollAndReadFd(slave_.get(), buf, sizeof(input), kTimeout),
+ PosixErrorIs(ETIMEDOUT, ::testing::StrEq("Poll timed out")));
+ char delim = ControlCharacter('D');
+ ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
+
+ // Now it is available. Note that ^D is not included.
+ ExpectReadable(slave_, sizeof(input), buf);
+ EXPECT_STREQ(buf, "abc");
+
+ ExpectFinished(slave_);
+}
+
+// ICANON limits us to 4096 bytes including a terminating character. Anything
+// after and 4095th character is discarded (although still processed for
+// signals and echoing).
+TEST_F(PtyTest, CanonDiscard) {
+ constexpr char kInput = 'M';
+ constexpr int kInputSize = 4100;
+ constexpr int kIter = 3;
+
+ // A few times write more than the 4096 character maximum, then a newline.
+ constexpr char delim = '\n';
+ for (int i = 0; i < kIter; i++) {
+ // This makes too many syscalls for save/restore.
+ const DisableSave ds;
+ for (int i = 0; i < kInputSize; i++) {
+ ASSERT_THAT(WriteFd(master_.get(), &kInput, sizeof(kInput)),
+ SyscallSucceedsWithValue(sizeof(kInput)));
+ }
+ ASSERT_THAT(WriteFd(master_.get(), &delim, 1), SyscallSucceedsWithValue(1));
+ }
+
+ // There should be multiple truncated lines available to read.
+ for (int i = 0; i < kIter; i++) {
+ char buf[kInputSize] = {};
+ ExpectReadable(slave_, kMaxLineSize, buf);
+ EXPECT_EQ(buf[kMaxLineSize - 1], delim);
+ EXPECT_EQ(buf[kMaxLineSize - 2], kInput);
+ }
+
+ ExpectFinished(slave_);
+}
+
+TEST_F(PtyTest, CanonMultiline) {
+ constexpr char kInput1[] = "GO\n";
+ constexpr char kInput2[] = "BLUE\n";
+
+ // Write both lines.
+ ASSERT_THAT(WriteFd(master_.get(), kInput1, sizeof(kInput1) - 1),
+ SyscallSucceedsWithValue(sizeof(kInput1) - 1));
+ ASSERT_THAT(WriteFd(master_.get(), kInput2, sizeof(kInput2) - 1),
+ SyscallSucceedsWithValue(sizeof(kInput2) - 1));
+
+ // Get the first line.
+ char line1[8] = {};
+ ExpectReadable(slave_, sizeof(kInput1) - 1, line1);
+ EXPECT_STREQ(line1, kInput1);
+
+ // Get the second line.
+ char line2[8] = {};
+ ExpectReadable(slave_, sizeof(kInput2) - 1, line2);
+ EXPECT_STREQ(line2, kInput2);
+
+ ExpectFinished(slave_);
+}
+
+TEST_F(PtyTest, SwitchNoncanonToCanonMultiline) {
+ DisableCanonical();
+
+ constexpr char kInput1[] = "GO\n";
+ constexpr char kInput2[] = "BLUE\n";
+ constexpr char kExpected[] = "GO\nBLUE\n";
+
+ // Write both lines.
+ ASSERT_THAT(WriteFd(master_.get(), kInput1, sizeof(kInput1) - 1),
+ SyscallSucceedsWithValue(sizeof(kInput1) - 1));
+ ASSERT_THAT(WriteFd(master_.get(), kInput2, sizeof(kInput2) - 1),
+ SyscallSucceedsWithValue(sizeof(kInput2) - 1));
+
+ ASSERT_NO_ERRNO(
+ WaitUntilReceived(slave_.get(), sizeof(kInput1) + sizeof(kInput2) - 2));
+ EnableCanonical();
+
+ // Get all together as one line.
+ char line[9] = {};
+ ExpectReadable(slave_, 8, line);
+ EXPECT_STREQ(line, kExpected);
+
+ ExpectFinished(slave_);
+}
+
+TEST_F(PtyTest, SwitchTwiceMultiline) {
+ std::string kInputs[] = {"GO\n", "BLUE\n", "!"};
+ std::string kExpected = "GO\nBLUE\n!";
+
+ // Write each line.
+ for (const std::string& input : kInputs) {
+ ASSERT_THAT(WriteFd(master_.get(), input.c_str(), input.size()),
+ SyscallSucceedsWithValue(input.size()));
+ }
+
+ DisableCanonical();
+ // All written characters have to make it into the input queue before
+ // canonical mode is re-enabled. If the final '!' character hasn't been
+ // enqueued before canonical mode is re-enabled, it won't be readable.
+ ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), kExpected.size()));
+ EnableCanonical();
+
+ // Get all together as one line.
+ char line[10] = {};
+ ExpectReadable(slave_, 9, line);
+ EXPECT_STREQ(line, kExpected.c_str());
+
+ ExpectFinished(slave_);
+}
+
+TEST_F(PtyTest, QueueSize) {
+ // Write the line.
+ constexpr char kInput1[] = "GO\n";
+ ASSERT_THAT(WriteFd(master_.get(), kInput1, sizeof(kInput1) - 1),
+ SyscallSucceedsWithValue(sizeof(kInput1) - 1));
+ ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), sizeof(kInput1) - 1));
+
+ // Ensure that writing more (beyond what is readable) does not impact the
+ // readable size.
+ char input[kMaxLineSize];
+ memset(input, 'M', kMaxLineSize);
+ ASSERT_THAT(WriteFd(master_.get(), input, kMaxLineSize),
+ SyscallSucceedsWithValue(kMaxLineSize));
+ int inputBufSize = ASSERT_NO_ERRNO_AND_VALUE(
+ WaitUntilReceived(slave_.get(), sizeof(kInput1) - 1));
+ EXPECT_EQ(inputBufSize, sizeof(kInput1) - 1);
+}
+
+TEST_F(PtyTest, PartialBadBuffer) {
+ // Allocate 2 pages.
+ void* addr = mmap(nullptr, 2 * kPageSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(addr, MAP_FAILED);
+ char* buf = reinterpret_cast<char*>(addr);
+
+ // Guard the 2nd page for our read to run into.
+ ASSERT_THAT(
+ mprotect(reinterpret_cast<void*>(buf + kPageSize), kPageSize, PROT_NONE),
+ SyscallSucceeds());
+
+ // Leave only one free byte in the buffer.
+ char* bad_buffer = buf + kPageSize - 1;
+
+ // Write to the master.
+ constexpr char kBuf[] = "hello\n";
+ constexpr size_t size = sizeof(kBuf) - 1;
+ EXPECT_THAT(WriteFd(master_.get(), kBuf, size),
+ SyscallSucceedsWithValue(size));
+
+ // Read from the slave into bad_buffer.
+ ASSERT_NO_ERRNO(WaitUntilReceived(slave_.get(), size));
+ EXPECT_THAT(ReadFd(slave_.get(), bad_buffer, size),
+ SyscallFailsWithErrno(EFAULT));
+
+ EXPECT_THAT(munmap(addr, 2 * kPageSize), SyscallSucceeds()) << addr;
+}
+
+TEST_F(PtyTest, SimpleEcho) {
+ constexpr char kInput[] = "Mr. Eko";
+ EXPECT_THAT(WriteFd(master_.get(), kInput, strlen(kInput)),
+ SyscallSucceedsWithValue(strlen(kInput)));
+
+ char buf[100] = {};
+ ExpectReadable(master_, strlen(kInput), buf);
+
+ EXPECT_STREQ(buf, kInput);
+ ExpectFinished(master_);
+}
+
+TEST_F(PtyTest, GetWindowSize) {
+ struct winsize ws;
+ ASSERT_THAT(ioctl(slave_.get(), TIOCGWINSZ, &ws), SyscallSucceeds());
+ EXPECT_EQ(ws.ws_row, 0);
+ EXPECT_EQ(ws.ws_col, 0);
+}
+
+TEST_F(PtyTest, SetSlaveWindowSize) {
+ constexpr uint16_t kRows = 343;
+ constexpr uint16_t kCols = 2401;
+ struct winsize ws = {.ws_row = kRows, .ws_col = kCols};
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSWINSZ, &ws), SyscallSucceeds());
+
+ struct winsize retrieved_ws = {};
+ ASSERT_THAT(ioctl(master_.get(), TIOCGWINSZ, &retrieved_ws),
+ SyscallSucceeds());
+ EXPECT_EQ(retrieved_ws.ws_row, kRows);
+ EXPECT_EQ(retrieved_ws.ws_col, kCols);
+}
+
+TEST_F(PtyTest, SetMasterWindowSize) {
+ constexpr uint16_t kRows = 343;
+ constexpr uint16_t kCols = 2401;
+ struct winsize ws = {.ws_row = kRows, .ws_col = kCols};
+ ASSERT_THAT(ioctl(master_.get(), TIOCSWINSZ, &ws), SyscallSucceeds());
+
+ struct winsize retrieved_ws = {};
+ ASSERT_THAT(ioctl(slave_.get(), TIOCGWINSZ, &retrieved_ws),
+ SyscallSucceeds());
+ EXPECT_EQ(retrieved_ws.ws_row, kRows);
+ EXPECT_EQ(retrieved_ws.ws_col, kCols);
+}
+
+class JobControlTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ master_ = ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
+ slave_ = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master_));
+
+ // Make this a session leader, which also drops the controlling terminal.
+ // In the gVisor test environment, this test will be run as the session
+ // leader already (as the sentry init process).
+ if (!IsRunningOnGvisor()) {
+ ASSERT_THAT(setsid(), SyscallSucceeds());
+ }
+ }
+
+ // Master and slave ends of the PTY. Non-blocking.
+ FileDescriptor master_;
+ FileDescriptor slave_;
+};
+
+TEST_F(JobControlTest, SetTTYMaster) {
+ ASSERT_THAT(ioctl(master_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+}
+
+TEST_F(JobControlTest, SetTTY) {
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+}
+
+TEST_F(JobControlTest, SetTTYNonLeader) {
+ // Fork a process that won't be the session leader.
+ pid_t child = fork();
+ if (!child) {
+ // We shouldn't be able to set the terminal.
+ TEST_PCHECK(ioctl(slave_.get(), TIOCSCTTY, 0));
+ _exit(0);
+ }
+
+ int wstatus;
+ ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+ ASSERT_EQ(wstatus, 0);
+}
+
+TEST_F(JobControlTest, SetTTYBadArg) {
+ // Despite the man page saying arg should be 0 here, Linux doesn't actually
+ // check.
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 1), SyscallSucceeds());
+}
+
+TEST_F(JobControlTest, SetTTYDifferentSession) {
+ SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+ // Fork, join a new session, and try to steal the parent's controlling
+ // terminal, which should fail.
+ pid_t child = fork();
+ if (!child) {
+ TEST_PCHECK(setsid() >= 0);
+ // We shouldn't be able to steal the terminal.
+ TEST_PCHECK(ioctl(slave_.get(), TIOCSCTTY, 1));
+ _exit(0);
+ }
+
+ int wstatus;
+ ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+ ASSERT_EQ(wstatus, 0);
+}
+
+TEST_F(JobControlTest, ReleaseTTY) {
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+ // Make sure we're ignoring SIGHUP, which will be sent to this process once we
+ // disconnect they TTY.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_IGN;
+ sa.sa_flags = 0;
+ sigemptyset(&sa.sa_mask);
+ struct sigaction old_sa;
+ EXPECT_THAT(sigaction(SIGHUP, &sa, &old_sa), SyscallSucceeds());
+ EXPECT_THAT(ioctl(slave_.get(), TIOCNOTTY), SyscallSucceeds());
+ EXPECT_THAT(sigaction(SIGHUP, &old_sa, NULL), SyscallSucceeds());
+}
+
+TEST_F(JobControlTest, ReleaseUnsetTTY) {
+ ASSERT_THAT(ioctl(slave_.get(), TIOCNOTTY), SyscallFailsWithErrno(ENOTTY));
+}
+
+TEST_F(JobControlTest, ReleaseWrongTTY) {
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+ ASSERT_THAT(ioctl(master_.get(), TIOCNOTTY), SyscallFailsWithErrno(ENOTTY));
+}
+
+TEST_F(JobControlTest, ReleaseTTYNonLeader) {
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+ pid_t child = fork();
+ if (!child) {
+ TEST_PCHECK(!ioctl(slave_.get(), TIOCNOTTY));
+ _exit(0);
+ }
+
+ int wstatus;
+ ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+ ASSERT_EQ(wstatus, 0);
+}
+
+TEST_F(JobControlTest, ReleaseTTYDifferentSession) {
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+ pid_t child = fork();
+ if (!child) {
+ // Join a new session, then try to disconnect.
+ TEST_PCHECK(setsid() >= 0);
+ TEST_PCHECK(ioctl(slave_.get(), TIOCNOTTY));
+ _exit(0);
+ }
+
+ int wstatus;
+ ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+ ASSERT_EQ(wstatus, 0);
+}
+
+// Used by the child process spawned in ReleaseTTYSignals to track received
+// signals.
+static int received;
+
+void sig_handler(int signum) { received |= signum; }
+
+// When the session leader releases its controlling terminal, the foreground
+// process group gets SIGHUP, then SIGCONT. This test:
+// - Spawns 2 threads
+// - Has thread 1 return 0 if it gets both SIGHUP and SIGCONT
+// - Has thread 2 leave the foreground process group, and return non-zero if it
+// receives any signals.
+// - Has the parent thread release its controlling terminal
+// - Checks that thread 1 got both signals
+// - Checks that thread 2 didn't get any signals.
+TEST_F(JobControlTest, ReleaseTTYSignals) {
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+ received = 0;
+ struct sigaction sa = {};
+ sa.sa_handler = sig_handler;
+ sa.sa_flags = 0;
+ sigemptyset(&sa.sa_mask);
+ sigaddset(&sa.sa_mask, SIGHUP);
+ sigaddset(&sa.sa_mask, SIGCONT);
+ sigprocmask(SIG_BLOCK, &sa.sa_mask, NULL);
+
+ pid_t same_pgrp_child = fork();
+ if (!same_pgrp_child) {
+ // The child will wait for SIGHUP and SIGCONT, then return 0. It begins with
+ // SIGHUP and SIGCONT blocked. We install signal handlers for those signals,
+ // then use sigsuspend to wait for those specific signals.
+ TEST_PCHECK(!sigaction(SIGHUP, &sa, NULL));
+ TEST_PCHECK(!sigaction(SIGCONT, &sa, NULL));
+ sigset_t mask;
+ sigfillset(&mask);
+ sigdelset(&mask, SIGHUP);
+ sigdelset(&mask, SIGCONT);
+ while (received != (SIGHUP | SIGCONT)) {
+ sigsuspend(&mask);
+ }
+ _exit(0);
+ }
+
+ // We don't want to block these anymore.
+ sigprocmask(SIG_UNBLOCK, &sa.sa_mask, NULL);
+
+ // This child will return non-zero if either SIGHUP or SIGCONT are received.
+ pid_t diff_pgrp_child = fork();
+ if (!diff_pgrp_child) {
+ TEST_PCHECK(!setpgid(0, 0));
+ TEST_PCHECK(pause());
+ _exit(1);
+ }
+
+ EXPECT_THAT(setpgid(diff_pgrp_child, diff_pgrp_child), SyscallSucceeds());
+
+ // Make sure we're ignoring SIGHUP, which will be sent to this process once we
+ // disconnect they TTY.
+ struct sigaction sighup_sa = {};
+ sighup_sa.sa_handler = SIG_IGN;
+ sighup_sa.sa_flags = 0;
+ sigemptyset(&sighup_sa.sa_mask);
+ struct sigaction old_sa;
+ EXPECT_THAT(sigaction(SIGHUP, &sighup_sa, &old_sa), SyscallSucceeds());
+
+ // Release the controlling terminal, sending SIGHUP and SIGCONT to all other
+ // processes in this process group.
+ EXPECT_THAT(ioctl(slave_.get(), TIOCNOTTY), SyscallSucceeds());
+
+ EXPECT_THAT(sigaction(SIGHUP, &old_sa, NULL), SyscallSucceeds());
+
+ // The child in the same process group will get signaled.
+ int wstatus;
+ EXPECT_THAT(waitpid(same_pgrp_child, &wstatus, 0),
+ SyscallSucceedsWithValue(same_pgrp_child));
+ EXPECT_EQ(wstatus, 0);
+
+ // The other child will not get signaled.
+ EXPECT_THAT(waitpid(diff_pgrp_child, &wstatus, WNOHANG),
+ SyscallSucceedsWithValue(0));
+ EXPECT_THAT(kill(diff_pgrp_child, SIGKILL), SyscallSucceeds());
+}
+
+TEST_F(JobControlTest, GetForegroundProcessGroup) {
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+ pid_t foreground_pgid;
+ pid_t pid;
+ ASSERT_THAT(ioctl(slave_.get(), TIOCGPGRP, &foreground_pgid),
+ SyscallSucceeds());
+ ASSERT_THAT(pid = getpid(), SyscallSucceeds());
+
+ ASSERT_EQ(foreground_pgid, pid);
+}
+
+TEST_F(JobControlTest, GetForegroundProcessGroupNonControlling) {
+ // At this point there's no controlling terminal, so TIOCGPGRP should fail.
+ pid_t foreground_pgid;
+ ASSERT_THAT(ioctl(slave_.get(), TIOCGPGRP, &foreground_pgid),
+ SyscallFailsWithErrno(ENOTTY));
+}
+
+// This test:
+// - sets itself as the foreground process group
+// - creates a child process in a new process group
+// - sets that child as the foreground process group
+// - kills its child and sets itself as the foreground process group.
+TEST_F(JobControlTest, SetForegroundProcessGroup) {
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+ // Ignore SIGTTOU so that we don't stop ourself when calling tcsetpgrp.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_IGN;
+ sa.sa_flags = 0;
+ sigemptyset(&sa.sa_mask);
+ sigaction(SIGTTOU, &sa, NULL);
+
+ // Set ourself as the foreground process group.
+ ASSERT_THAT(tcsetpgrp(slave_.get(), getpgid(0)), SyscallSucceeds());
+
+ // Create a new process that just waits to be signaled.
+ pid_t child = fork();
+ if (!child) {
+ TEST_PCHECK(!pause());
+ // We should never reach this.
+ _exit(1);
+ }
+
+ // Make the child its own process group, then make it the controlling process
+ // group of the terminal.
+ ASSERT_THAT(setpgid(child, child), SyscallSucceeds());
+ ASSERT_THAT(tcsetpgrp(slave_.get(), child), SyscallSucceeds());
+
+ // Sanity check - we're still the controlling session.
+ ASSERT_EQ(getsid(0), getsid(child));
+
+ // Signal the child, wait for it to exit, then retake the terminal.
+ ASSERT_THAT(kill(child, SIGTERM), SyscallSucceeds());
+ int wstatus;
+ ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+ ASSERT_TRUE(WIFSIGNALED(wstatus));
+ ASSERT_EQ(WTERMSIG(wstatus), SIGTERM);
+
+ // Set ourself as the foreground process.
+ pid_t pgid;
+ ASSERT_THAT(pgid = getpgid(0), SyscallSucceeds());
+ ASSERT_THAT(tcsetpgrp(slave_.get(), pgid), SyscallSucceeds());
+}
+
+TEST_F(JobControlTest, SetForegroundProcessGroupWrongTTY) {
+ pid_t pid = getpid();
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSPGRP, &pid),
+ SyscallFailsWithErrno(ENOTTY));
+}
+
+TEST_F(JobControlTest, SetForegroundProcessGroupNegPgid) {
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+ pid_t pid = -1;
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSPGRP, &pid),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(JobControlTest, SetForegroundProcessGroupEmptyProcessGroup) {
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+ // Create a new process, put it in a new process group, make that group the
+ // foreground process group, then have the process wait.
+ pid_t child = fork();
+ if (!child) {
+ TEST_PCHECK(!setpgid(0, 0));
+ _exit(0);
+ }
+
+ // Wait for the child to exit.
+ int wstatus;
+ EXPECT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+ // The child's process group doesn't exist anymore - this should fail.
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSPGRP, &child),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+TEST_F(JobControlTest, SetForegroundProcessGroupDifferentSession) {
+ ASSERT_THAT(ioctl(slave_.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+ int sync_setsid[2];
+ int sync_exit[2];
+ ASSERT_THAT(pipe(sync_setsid), SyscallSucceeds());
+ ASSERT_THAT(pipe(sync_exit), SyscallSucceeds());
+
+ // Create a new process and put it in a new session.
+ pid_t child = fork();
+ if (!child) {
+ TEST_PCHECK(setsid() >= 0);
+ // Tell the parent we're in a new session.
+ char c = 'c';
+ TEST_PCHECK(WriteFd(sync_setsid[1], &c, 1) == 1);
+ TEST_PCHECK(ReadFd(sync_exit[0], &c, 1) == 1);
+ _exit(0);
+ }
+
+ // Wait for the child to tell us it's in a new session.
+ char c = 'c';
+ ASSERT_THAT(ReadFd(sync_setsid[0], &c, 1), SyscallSucceedsWithValue(1));
+
+ // Child is in a new session, so we can't make it the foregroup process group.
+ EXPECT_THAT(ioctl(slave_.get(), TIOCSPGRP, &child),
+ SyscallFailsWithErrno(EPERM));
+
+ EXPECT_THAT(WriteFd(sync_exit[1], &c, 1), SyscallSucceedsWithValue(1));
+
+ int wstatus;
+ EXPECT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+ EXPECT_TRUE(WIFEXITED(wstatus));
+ EXPECT_EQ(WEXITSTATUS(wstatus), 0);
+}
+
+// Verify that we don't hang when creating a new session from an orphaned
+// process group (b/139968068). Calling setsid() creates an orphaned process
+// group, as process groups that contain the session's leading process are
+// orphans.
+//
+// We create 2 sessions in this test. The init process in gVisor is considered
+// not to be an orphan (see sessions.go), so we have to create a session from
+// which to create a session. The latter session is being created from an
+// orphaned process group.
+TEST_F(JobControlTest, OrphanRegression) {
+ pid_t session_2_leader = fork();
+ if (!session_2_leader) {
+ TEST_PCHECK(setsid() >= 0);
+
+ pid_t session_3_leader = fork();
+ if (!session_3_leader) {
+ TEST_PCHECK(setsid() >= 0);
+
+ _exit(0);
+ }
+
+ int wstatus;
+ TEST_PCHECK(waitpid(session_3_leader, &wstatus, 0) == session_3_leader);
+ TEST_PCHECK(wstatus == 0);
+
+ _exit(0);
+ }
+
+ int wstatus;
+ ASSERT_THAT(waitpid(session_2_leader, &wstatus, 0),
+ SyscallSucceedsWithValue(session_2_leader));
+ ASSERT_EQ(wstatus, 0);
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/pty_root.cc b/test/syscalls/linux/pty_root.cc
new file mode 100644
index 000000000..1d7dbefdb
--- /dev/null
+++ b/test/syscalls/linux/pty_root.cc
@@ -0,0 +1,78 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/ioctl.h>
+#include <termios.h>
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/pty_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// StealTTY tests whether privileged processes can steal controlling terminals.
+// If the stealing process has CAP_SYS_ADMIN in the root user namespace, the
+// test ensures that stealing works. If it has non-root CAP_SYS_ADMIN, it
+// ensures stealing fails.
+TEST(JobControlRootTest, StealTTY) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ bool true_root = true;
+ if (!IsRunningOnGvisor()) {
+ // If running in Linux, we may only have CAP_SYS_ADMIN in a non-root user
+ // namespace (i.e. we are not truly root). We use init_module as a proxy for
+ // whether we are true root, as it returns EPERM immediately.
+ ASSERT_THAT(syscall(SYS_init_module, nullptr, 0, nullptr), SyscallFails());
+ true_root = errno != EPERM;
+
+ // Make this a session leader, which also drops the controlling terminal.
+ // In the gVisor test environment, this test will be run as the session
+ // leader already (as the sentry init process).
+ ASSERT_THAT(setsid(), SyscallSucceeds());
+ }
+
+ FileDescriptor master =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/ptmx", O_RDWR | O_NONBLOCK));
+ FileDescriptor slave = ASSERT_NO_ERRNO_AND_VALUE(OpenSlave(master));
+
+ // Make slave the controlling terminal.
+ ASSERT_THAT(ioctl(slave.get(), TIOCSCTTY, 0), SyscallSucceeds());
+
+ // Fork, join a new session, and try to steal the parent's controlling
+ // terminal, which should succeed when we have CAP_SYS_ADMIN and pass an arg
+ // of 1.
+ pid_t child = fork();
+ if (!child) {
+ ASSERT_THAT(setsid(), SyscallSucceeds());
+ // We shouldn't be able to steal the terminal with the wrong arg value.
+ TEST_PCHECK(ioctl(slave.get(), TIOCSCTTY, 0));
+ // We should be able to steal it if we are true root.
+ TEST_PCHECK(true_root == !ioctl(slave.get(), TIOCSCTTY, 1));
+ _exit(0);
+ }
+
+ int wstatus;
+ ASSERT_THAT(waitpid(child, &wstatus, 0), SyscallSucceedsWithValue(child));
+ ASSERT_EQ(wstatus, 0);
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc
new file mode 100644
index 000000000..e69794910
--- /dev/null
+++ b/test/syscalls/linux/pwrite64.cc
@@ -0,0 +1,83 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/unistd.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// TODO(gvisor.dev/issue/2370): This test is currently very rudimentary.
+class Pwrite64 : public ::testing::Test {
+ void SetUp() override {
+ name_ = NewTempAbsPath();
+ int fd;
+ ASSERT_THAT(fd = open(name_.c_str(), O_CREAT, 0644), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ }
+
+ void TearDown() override { unlink(name_.c_str()); }
+
+ public:
+ std::string name_;
+};
+
+TEST_F(Pwrite64, AppendOnly) {
+ int fd;
+ ASSERT_THAT(fd = open(name_.c_str(), O_APPEND | O_RDWR), SyscallSucceeds());
+ constexpr int64_t kBufSize = 1024;
+ std::vector<char> buf(kBufSize);
+ std::fill(buf.begin(), buf.end(), 'a');
+ EXPECT_THAT(PwriteFd(fd, buf.data(), buf.size(), 0),
+ SyscallSucceedsWithValue(buf.size()));
+ EXPECT_THAT(lseek(fd, 0, SEEK_CUR), SyscallSucceedsWithValue(0));
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST_F(Pwrite64, InvalidArgs) {
+ int fd;
+ ASSERT_THAT(fd = open(name_.c_str(), O_APPEND | O_RDWR), SyscallSucceeds());
+ constexpr int64_t kBufSize = 1024;
+ std::vector<char> buf(kBufSize);
+ std::fill(buf.begin(), buf.end(), 'a');
+ EXPECT_THAT(PwriteFd(fd, buf.data(), buf.size(), -1),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST_F(Pwrite64, Overflow) {
+ int fd;
+ ASSERT_THAT(fd = open(name_.c_str(), O_APPEND | O_RDWR), SyscallSucceeds());
+ constexpr int64_t kBufSize = 1024;
+ std::vector<char> buf(kBufSize);
+ std::fill(buf.begin(), buf.end(), 'a');
+ EXPECT_THAT(PwriteFd(fd, buf.data(), buf.size(), 0x7fffffffffffffffull),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/pwritev2.cc b/test/syscalls/linux/pwritev2.cc
new file mode 100644
index 000000000..63b686c62
--- /dev/null
+++ b/test/syscalls/linux/pwritev2.cc
@@ -0,0 +1,307 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+#ifndef SYS_pwritev2
+#if defined(__x86_64__)
+#define SYS_pwritev2 328
+#elif defined(__aarch64__)
+#define SYS_pwritev2 287
+#else
+#error "Unknown architecture"
+#endif
+#endif // SYS_pwrite2
+
+#ifndef RWF_HIPRI
+#define RWF_HIPRI 0x1
+#endif // RWF_HIPRI
+
+#ifndef RWF_DSYNC
+#define RWF_DSYNC 0x2
+#endif // RWF_DSYNC
+
+#ifndef RWF_SYNC
+#define RWF_SYNC 0x4
+#endif // RWF_SYNC
+
+constexpr int kBufSize = 1024;
+
+void SetContent(std::vector<char>& content) {
+ for (uint i = 0; i < content.size(); i++) {
+ content[i] = static_cast<char>((i % 10) + '0');
+ }
+}
+
+ssize_t pwritev2(unsigned long fd, const struct iovec* iov,
+ unsigned long iovcnt, off_t offset, unsigned long flags) {
+ // syscall on pwritev2 does some weird things (see man syscall and search
+ // pwritev2), so we insert a 0 to word align the flags argument on native.
+ return syscall(SYS_pwritev2, fd, iov, iovcnt, offset, 0, flags);
+}
+
+// This test is the base case where we call pwritev (no offset, no flags).
+TEST(Writev2Test, BaseCall) {
+ SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+ std::vector<char> content(kBufSize);
+ SetContent(content);
+ struct iovec iov[2];
+ iov[0].iov_base = content.data();
+ iov[0].iov_len = content.size() / 2;
+ iov[1].iov_base = static_cast<char*>(iov[0].iov_base) + (content.size() / 2);
+ iov[1].iov_len = content.size() / 2;
+
+ ASSERT_THAT(pwritev2(fd.get(), iov, /*iovcnt=*/2,
+ /*offset=*/0, /*flags=*/0),
+ SyscallSucceedsWithValue(kBufSize));
+
+ std::vector<char> buf(kBufSize);
+ EXPECT_THAT(read(fd.get(), buf.data(), kBufSize),
+ SyscallSucceedsWithValue(kBufSize));
+
+ EXPECT_EQ(content, buf);
+}
+
+// This test is where we call pwritev2 with a positive offset and no flags.
+TEST(Pwritev2Test, ValidPositiveOffset) {
+ SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ std::string prefix(kBufSize, '0');
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), prefix, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+ std::vector<char> content(kBufSize);
+ SetContent(content);
+ struct iovec iov;
+ iov.iov_base = content.data();
+ iov.iov_len = content.size();
+
+ ASSERT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1,
+ /*offset=*/prefix.size(), /*flags=*/0),
+ SyscallSucceedsWithValue(content.size()));
+
+ std::vector<char> buf(prefix.size() + content.size());
+ EXPECT_THAT(read(fd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ std::vector<char> want(prefix.begin(), prefix.end());
+ want.insert(want.end(), content.begin(), content.end());
+ EXPECT_EQ(want, buf);
+}
+
+// This test is the base case where we call writev by using -1 as the offset.
+// The write should use the file offset, so the test increments the file offset
+// prior to call pwritev2.
+TEST(Pwritev2Test, NegativeOneOffset) {
+ SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ const std::string prefix = "00";
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), prefix.data(), TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+ ASSERT_THAT(lseek(fd.get(), prefix.size(), SEEK_SET),
+ SyscallSucceedsWithValue(prefix.size()));
+
+ std::vector<char> content(kBufSize);
+ SetContent(content);
+ struct iovec iov;
+ iov.iov_base = content.data();
+ iov.iov_len = content.size();
+
+ ASSERT_THAT(pwritev2(fd.get(), &iov, /*iovcnt*/ 1,
+ /*offset=*/static_cast<off_t>(-1), /*flags=*/0),
+ SyscallSucceedsWithValue(content.size()));
+
+ ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR),
+ SyscallSucceedsWithValue(prefix.size() + content.size()));
+
+ std::vector<char> buf(prefix.size() + content.size());
+ EXPECT_THAT(pread(fd.get(), buf.data(), buf.size(), /*offset=*/0),
+ SyscallSucceedsWithValue(buf.size()));
+
+ std::vector<char> want(prefix.begin(), prefix.end());
+ want.insert(want.end(), content.begin(), content.end());
+ EXPECT_EQ(want, buf);
+}
+
+// pwritev2 requires if the RWF_HIPRI flag is passed, the fd must be opened with
+// O_DIRECT. This test implements a correct call with the RWF_HIPRI flag.
+TEST(Pwritev2Test, CallWithRWF_HIPRI) {
+ SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+ std::vector<char> content(kBufSize);
+ SetContent(content);
+ struct iovec iov;
+ iov.iov_base = content.data();
+ iov.iov_len = content.size();
+
+ EXPECT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1,
+ /*offset=*/0, /*flags=*/RWF_HIPRI),
+ SyscallSucceedsWithValue(kBufSize));
+
+ std::vector<char> buf(content.size());
+ EXPECT_THAT(read(fd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ EXPECT_EQ(buf, content);
+}
+
+// This test calls pwritev2 with a bad file descriptor.
+TEST(Writev2Test, BadFile) {
+ SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+ ASSERT_THAT(pwritev2(/*fd=*/-1, /*iov=*/nullptr, /*iovcnt=*/0,
+ /*offset=*/0, /*flags=*/0),
+ SyscallFailsWithErrno(EBADF));
+}
+
+// This test calls pwrite2 with an invalid offset.
+TEST(Pwritev2Test, InvalidOffset) {
+ SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+ char buf[16];
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = sizeof(buf);
+
+ EXPECT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1,
+ /*offset=*/static_cast<off_t>(-8), /*flags=*/0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(Pwritev2Test, UnseekableFileValid) {
+ SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ int pipe_fds[2];
+
+ ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds());
+
+ std::vector<char> content(32, '0');
+ SetContent(content);
+ struct iovec iov;
+ iov.iov_base = content.data();
+ iov.iov_len = content.size();
+
+ EXPECT_THAT(pwritev2(pipe_fds[1], &iov, /*iovcnt=*/1,
+ /*offset=*/static_cast<off_t>(-1), /*flags=*/0),
+ SyscallSucceedsWithValue(content.size()));
+
+ std::vector<char> buf(content.size());
+ EXPECT_THAT(read(pipe_fds[0], buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ EXPECT_EQ(content, buf);
+
+ EXPECT_THAT(close(pipe_fds[0]), SyscallSucceeds());
+ EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds());
+}
+
+// Calling pwritev2 with a non-negative offset calls pwritev. Calling pwritev
+// with an unseekable file is not allowed. A pipe is used for an unseekable
+// file.
+TEST(Pwritev2Test, UnseekableFileInvalid) {
+ SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ int pipe_fds[2];
+ char buf[16];
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = sizeof(buf);
+
+ ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds());
+
+ EXPECT_THAT(pwritev2(pipe_fds[1], &iov, /*iovcnt=*/1,
+ /*offset=*/2, /*flags=*/0),
+ SyscallFailsWithErrno(ESPIPE));
+
+ EXPECT_THAT(close(pipe_fds[0]), SyscallSucceeds());
+ EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds());
+}
+
+TEST(Pwritev2Test, ReadOnlyFile) {
+ SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+ char buf[16];
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = sizeof(buf);
+
+ EXPECT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1,
+ /*offset=*/0, /*flags=*/0),
+ SyscallFailsWithErrno(EBADF));
+}
+
+// This test calls pwritev2 with an invalid flag.
+TEST(Pwritev2Test, InvalidFlag) {
+ SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS);
+
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR | O_DIRECT));
+
+ char buf[16];
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = sizeof(buf);
+
+ EXPECT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1,
+ /*offset=*/0, /*flags=*/0xF0),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/raw_socket.cc b/test/syscalls/linux/raw_socket.cc
new file mode 100644
index 000000000..05c4ed03f
--- /dev/null
+++ b/test/syscalls/linux/raw_socket.cc
@@ -0,0 +1,819 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/capability.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/ip_icmp.h>
+#include <poll.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <algorithm>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+// Note: in order to run these tests, /proc/sys/net/ipv4/ping_group_range will
+// need to be configured to let the superuser create ping sockets (see icmp(7)).
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Fixture for tests parameterized by protocol.
+class RawSocketTest : public ::testing::TestWithParam<std::tuple<int, int>> {
+ protected:
+ // Creates a socket to be used in tests.
+ void SetUp() override;
+
+ // Closes the socket created by SetUp().
+ void TearDown() override;
+
+ // Sends buf via s_.
+ void SendBuf(const char* buf, int buf_len);
+
+ // Reads from s_ into recv_buf.
+ void ReceiveBuf(char* recv_buf, size_t recv_buf_len);
+
+ void ReceiveBufFrom(int sock, char* recv_buf, size_t recv_buf_len);
+
+ int Protocol() { return std::get<0>(GetParam()); }
+
+ int Family() { return std::get<1>(GetParam()); }
+
+ socklen_t AddrLen() {
+ if (Family() == AF_INET) {
+ return sizeof(sockaddr_in);
+ }
+ return sizeof(sockaddr_in6);
+ }
+
+ int HdrLen() {
+ if (Family() == AF_INET) {
+ return sizeof(struct iphdr);
+ }
+ // IPv6 raw sockets don't include the header.
+ return 0;
+ }
+
+ // The socket used for both reading and writing.
+ int s_;
+
+ // The loopback address.
+ struct sockaddr_storage addr_;
+};
+
+void RawSocketTest::SetUp() {
+ if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+ ASSERT_THAT(socket(Family(), SOCK_RAW, Protocol()),
+ SyscallFailsWithErrno(EPERM));
+ GTEST_SKIP();
+ }
+
+ ASSERT_THAT(s_ = socket(Family(), SOCK_RAW, Protocol()), SyscallSucceeds());
+
+ addr_ = {};
+
+ // We don't set ports because raw sockets don't have a notion of ports.
+ if (Family() == AF_INET) {
+ struct sockaddr_in* sin = reinterpret_cast<struct sockaddr_in*>(&addr_);
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ } else {
+ struct sockaddr_in6* sin6 = reinterpret_cast<struct sockaddr_in6*>(&addr_);
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_addr = in6addr_loopback;
+ }
+}
+
+void RawSocketTest::TearDown() {
+ // TearDown will be run even if we skip the test.
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+ EXPECT_THAT(close(s_), SyscallSucceeds());
+ }
+}
+
+// We should be able to create multiple raw sockets for the same protocol.
+// BasicRawSocket::Setup creates the first one, so we only have to create one
+// more here.
+TEST_P(RawSocketTest, MultipleCreation) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ int s2;
+ ASSERT_THAT(s2 = socket(Family(), SOCK_RAW, Protocol()), SyscallSucceeds());
+
+ ASSERT_THAT(close(s2), SyscallSucceeds());
+}
+
+// Test that shutting down an unconnected socket fails.
+TEST_P(RawSocketTest, FailShutdownWithoutConnect) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(shutdown(s_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+ ASSERT_THAT(shutdown(s_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
+}
+
+// Shutdown is a no-op for raw sockets (and datagram sockets in general).
+TEST_P(RawSocketTest, ShutdownWriteNoop) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+ ASSERT_THAT(shutdown(s_, SHUT_WR), SyscallSucceeds());
+
+ // Arbitrary.
+ constexpr char kBuf[] = "noop";
+ ASSERT_THAT(RetryEINTR(write)(s_, kBuf, sizeof(kBuf)),
+ SyscallSucceedsWithValue(sizeof(kBuf)));
+}
+
+// Shutdown is a no-op for raw sockets (and datagram sockets in general).
+TEST_P(RawSocketTest, ShutdownReadNoop) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+ ASSERT_THAT(shutdown(s_, SHUT_RD), SyscallSucceeds());
+
+ // Arbitrary.
+ constexpr char kBuf[] = "gdg";
+ ASSERT_NO_FATAL_FAILURE(SendBuf(kBuf, sizeof(kBuf)));
+
+ std::vector<char> c(sizeof(kBuf) + HdrLen());
+ ASSERT_THAT(read(s_, c.data(), c.size()), SyscallSucceedsWithValue(c.size()));
+}
+
+// Test that listen() fails.
+TEST_P(RawSocketTest, FailListen) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(listen(s_, 1), SyscallFailsWithErrno(ENOTSUP));
+}
+
+// Test that accept() fails.
+TEST_P(RawSocketTest, FailAccept) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ struct sockaddr saddr;
+ socklen_t addrlen;
+ ASSERT_THAT(accept(s_, &saddr, &addrlen), SyscallFailsWithErrno(ENOTSUP));
+}
+
+// Test that getpeername() returns nothing before connect().
+TEST_P(RawSocketTest, FailGetPeerNameBeforeConnect) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ struct sockaddr saddr;
+ socklen_t addrlen = sizeof(saddr);
+ ASSERT_THAT(getpeername(s_, &saddr, &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+}
+
+// Test that getpeername() returns something after connect().
+TEST_P(RawSocketTest, GetPeerName) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+ struct sockaddr saddr;
+ socklen_t addrlen = sizeof(saddr);
+ ASSERT_THAT(getpeername(s_, &saddr, &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+ ASSERT_GT(addrlen, 0);
+}
+
+// Test that the socket is writable immediately.
+TEST_P(RawSocketTest, PollWritableImmediately) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ struct pollfd pfd = {};
+ pfd.fd = s_;
+ pfd.events = POLLOUT;
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 10000), SyscallSucceedsWithValue(1));
+}
+
+// Test that the socket isn't readable before receiving anything.
+TEST_P(RawSocketTest, PollNotReadableInitially) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Try to receive data with MSG_DONTWAIT, which returns immediately if there's
+ // nothing to be read.
+ char buf[117];
+ ASSERT_THAT(RetryEINTR(recv)(s_, buf, sizeof(buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Test that the socket becomes readable once something is written to it.
+TEST_P(RawSocketTest, PollTriggeredOnWrite) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Write something so that there's data to be read.
+ // Arbitrary.
+ constexpr char kBuf[] = "JP5";
+ ASSERT_NO_FATAL_FAILURE(SendBuf(kBuf, sizeof(kBuf)));
+
+ struct pollfd pfd = {};
+ pfd.fd = s_;
+ pfd.events = POLLIN;
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 10000), SyscallSucceedsWithValue(1));
+}
+
+// Test that we can connect() to a valid IP (loopback).
+TEST_P(RawSocketTest, ConnectToLoopback) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+}
+
+// Test that calling send() without connect() fails.
+TEST_P(RawSocketTest, SendWithoutConnectFails) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Arbitrary.
+ constexpr char kBuf[] = "Endgame was good";
+ ASSERT_THAT(send(s_, kBuf, sizeof(kBuf), 0),
+ SyscallFailsWithErrno(EDESTADDRREQ));
+}
+
+// Bind to localhost.
+TEST_P(RawSocketTest, BindToLocalhost) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+}
+
+// Bind to a different address.
+TEST_P(RawSocketTest, BindToInvalid) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ struct sockaddr_storage bind_addr = addr_;
+ if (Family() == AF_INET) {
+ struct sockaddr_in* sin = reinterpret_cast<struct sockaddr_in*>(&bind_addr);
+ sin->sin_addr = {1}; // 1.0.0.0 - An address that we can't bind to.
+ } else {
+ struct sockaddr_in6* sin6 =
+ reinterpret_cast<struct sockaddr_in6*>(&bind_addr);
+ memset(&sin6->sin6_addr.s6_addr, 0, sizeof(sin6->sin6_addr.s6_addr));
+ sin6->sin6_addr.s6_addr[0] = 1; // 1: - An address that we can't bind to.
+ }
+ ASSERT_THAT(bind(s_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+ AddrLen()), SyscallFailsWithErrno(EADDRNOTAVAIL));
+}
+
+// Send and receive an packet.
+TEST_P(RawSocketTest, SendAndReceive) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Arbitrary.
+ constexpr char kBuf[] = "TB12";
+ ASSERT_NO_FATAL_FAILURE(SendBuf(kBuf, sizeof(kBuf)));
+
+ // Receive the packet and make sure it's identical.
+ std::vector<char> recv_buf(sizeof(kBuf) + HdrLen());
+ ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf.data(), recv_buf.size()));
+ EXPECT_EQ(memcmp(recv_buf.data() + HdrLen(), kBuf, sizeof(kBuf)), 0);
+}
+
+// We should be able to create multiple raw sockets for the same protocol and
+// receive the same packet on both.
+TEST_P(RawSocketTest, MultipleSocketReceive) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ int s2;
+ ASSERT_THAT(s2 = socket(Family(), SOCK_RAW, Protocol()), SyscallSucceeds());
+
+ // Arbitrary.
+ constexpr char kBuf[] = "TB10";
+ ASSERT_NO_FATAL_FAILURE(SendBuf(kBuf, sizeof(kBuf)));
+
+ // Receive it on socket 1.
+ std::vector<char> recv_buf1(sizeof(kBuf) + HdrLen());
+ ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf1.data(), recv_buf1.size()));
+
+ // Receive it on socket 2.
+ std::vector<char> recv_buf2(sizeof(kBuf) + HdrLen());
+ ASSERT_NO_FATAL_FAILURE(ReceiveBufFrom(s2, recv_buf2.data(),
+ recv_buf2.size()));
+
+ EXPECT_EQ(memcmp(recv_buf1.data() + HdrLen(),
+ recv_buf2.data() + HdrLen(), sizeof(kBuf)),
+ 0);
+
+ ASSERT_THAT(close(s2), SyscallSucceeds());
+}
+
+// Test that connect sends packets to the right place.
+TEST_P(RawSocketTest, SendAndReceiveViaConnect) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+
+ // Arbitrary.
+ constexpr char kBuf[] = "JH4";
+ ASSERT_THAT(send(s_, kBuf, sizeof(kBuf), 0),
+ SyscallSucceedsWithValue(sizeof(kBuf)));
+
+ // Receive the packet and make sure it's identical.
+ std::vector<char> recv_buf(sizeof(kBuf) + HdrLen());
+ ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf.data(), recv_buf.size()));
+ EXPECT_EQ(memcmp(recv_buf.data() + HdrLen(), kBuf, sizeof(kBuf)), 0);
+}
+
+// Bind to localhost, then send and receive packets.
+TEST_P(RawSocketTest, BindSendAndReceive) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+
+ // Arbitrary.
+ constexpr char kBuf[] = "DR16";
+ ASSERT_NO_FATAL_FAILURE(SendBuf(kBuf, sizeof(kBuf)));
+
+ // Receive the packet and make sure it's identical.
+ std::vector<char> recv_buf(sizeof(kBuf) + HdrLen());
+ ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf.data(), recv_buf.size()));
+ EXPECT_EQ(memcmp(recv_buf.data() + HdrLen(), kBuf, sizeof(kBuf)), 0);
+}
+
+// Bind and connect to localhost and send/receive packets.
+TEST_P(RawSocketTest, BindConnectSendAndReceive) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+
+ // Arbitrary.
+ constexpr char kBuf[] = "DG88";
+ ASSERT_NO_FATAL_FAILURE(SendBuf(kBuf, sizeof(kBuf)));
+
+ // Receive the packet and make sure it's identical.
+ std::vector<char> recv_buf(sizeof(kBuf) + HdrLen());
+ ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf.data(), recv_buf.size()));
+ EXPECT_EQ(memcmp(recv_buf.data() + HdrLen(), kBuf, sizeof(kBuf)), 0);
+}
+
+// Check that setting SO_RCVBUF below min is clamped to the minimum
+// receive buffer size.
+TEST_P(RawSocketTest, SetSocketRecvBufBelowMin) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Discover minimum receive buf size by trying to set it to zero.
+ // See:
+ // https://github.com/torvalds/linux/blob/a5dc8300df75e8b8384b4c82225f1e4a0b4d9b55/net/core/sock.c#L820
+ constexpr int kRcvBufSz = 0;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ int min = 0;
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+ SyscallSucceeds());
+
+ // Linux doubles the value so let's use a value that when doubled will still
+ // be smaller than min.
+ int below_min = min / 2 - 1;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &below_min, sizeof(below_min)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &val, &val_len),
+ SyscallSucceeds());
+
+ ASSERT_EQ(min, val);
+}
+
+// Check that setting SO_RCVBUF above max is clamped to the maximum
+// receive buffer size.
+TEST_P(RawSocketTest, SetSocketRecvBufAboveMax) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Discover max buf size by trying to set the largest possible buffer size.
+ constexpr int kRcvBufSz = 0xffffffff;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ int max = 0;
+ socklen_t max_len = sizeof(max);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &max, &max_len),
+ SyscallSucceeds());
+
+ int above_max = max + 1;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &above_max, sizeof(above_max)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &val, &val_len),
+ SyscallSucceeds());
+ ASSERT_EQ(max, val);
+}
+
+// Check that setting SO_RCVBUF min <= kRcvBufSz <= max is honored.
+TEST_P(RawSocketTest, SetSocketRecvBuf) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ int max = 0;
+ int min = 0;
+ {
+ // Discover max buf size by trying to set a really large buffer size.
+ constexpr int kRcvBufSz = 0xffffffff;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ max = 0;
+ socklen_t max_len = sizeof(max);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &max, &max_len),
+ SyscallSucceeds());
+ }
+
+ {
+ // Discover minimum buffer size by trying to set a zero size receive buffer
+ // size.
+ // See:
+ // https://github.com/torvalds/linux/blob/a5dc8300df75e8b8384b4c82225f1e4a0b4d9b55/net/core/sock.c#L820
+ constexpr int kRcvBufSz = 0;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+ SyscallSucceeds());
+ }
+
+ int quarter_sz = min + (max - min) / 4;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &quarter_sz, sizeof(quarter_sz)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &val, &val_len),
+ SyscallSucceeds());
+
+ // Linux doubles the value set by SO_SNDBUF/SO_RCVBUF.
+ // TODO(gvisor.dev/issue/2926): Remove when Netstack matches linux behavior.
+ if (!IsRunningOnGvisor()) {
+ quarter_sz *= 2;
+ }
+ ASSERT_EQ(quarter_sz, val);
+}
+
+// Check that setting SO_SNDBUF below min is clamped to the minimum
+// receive buffer size.
+TEST_P(RawSocketTest, SetSocketSendBufBelowMin) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Discover minimum buffer size by trying to set it to zero.
+ constexpr int kSndBufSz = 0;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, sizeof(kSndBufSz)),
+ SyscallSucceeds());
+
+ int min = 0;
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &min, &min_len),
+ SyscallSucceeds());
+
+ // Linux doubles the value so let's use a value that when doubled will still
+ // be smaller than min.
+ int below_min = min / 2 - 1;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &below_min, sizeof(below_min)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &val, &val_len),
+ SyscallSucceeds());
+
+ ASSERT_EQ(min, val);
+}
+
+// Check that setting SO_SNDBUF above max is clamped to the maximum
+// send buffer size.
+TEST_P(RawSocketTest, SetSocketSendBufAboveMax) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Discover maximum buffer size by trying to set it to a large value.
+ constexpr int kSndBufSz = 0xffffffff;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, sizeof(kSndBufSz)),
+ SyscallSucceeds());
+
+ int max = 0;
+ socklen_t max_len = sizeof(max);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &max, &max_len),
+ SyscallSucceeds());
+
+ int above_max = max + 1;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &above_max, sizeof(above_max)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &val, &val_len),
+ SyscallSucceeds());
+ ASSERT_EQ(max, val);
+}
+
+// Check that setting SO_SNDBUF min <= kSndBufSz <= max is honored.
+TEST_P(RawSocketTest, SetSocketSendBuf) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ int max = 0;
+ int min = 0;
+ {
+ // Discover maximum buffer size by trying to set it to a large value.
+ constexpr int kSndBufSz = 0xffffffff;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, sizeof(kSndBufSz)),
+ SyscallSucceeds());
+
+ max = 0;
+ socklen_t max_len = sizeof(max);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &max, &max_len),
+ SyscallSucceeds());
+ }
+
+ {
+ // Discover minimum buffer size by trying to set it to zero.
+ constexpr int kSndBufSz = 0;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &kSndBufSz, sizeof(kSndBufSz)),
+ SyscallSucceeds());
+
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &min, &min_len),
+ SyscallSucceeds());
+ }
+
+ int quarter_sz = min + (max - min) / 4;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &quarter_sz, sizeof(quarter_sz)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &val, &val_len),
+ SyscallSucceeds());
+
+ // Linux doubles the value set by SO_SNDBUF/SO_RCVBUF.
+ // TODO(gvisor.dev/issue/2926): Remove the gvisor special casing when Netstack
+ // matches linux behavior.
+ if (!IsRunningOnGvisor()) {
+ quarter_sz *= 2;
+ }
+
+ ASSERT_EQ(quarter_sz, val);
+}
+
+// Test that receive buffer limits are not enforced when the recv buffer is
+// empty.
+TEST_P(RawSocketTest, RecvBufLimitsEmptyRecvBuffer) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+
+ int min = 0;
+ {
+ // Discover minimum buffer size by trying to set it to zero.
+ constexpr int kRcvBufSz = 0;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+ SyscallSucceeds());
+ }
+
+ {
+ // Send data of size min and verify that it's received.
+ std::vector<char> buf(min);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_NO_FATAL_FAILURE(SendBuf(buf.data(), buf.size()));
+
+ // Receive the packet and make sure it's identical.
+ std::vector<char> recv_buf(buf.size() + HdrLen());
+ ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf.data(), recv_buf.size()));
+ EXPECT_EQ(
+ memcmp(recv_buf.data() + HdrLen(), buf.data(), buf.size()),
+ 0);
+ }
+
+ {
+ // Send data of size min + 1 and verify that its received. Both linux and
+ // Netstack accept a dgram that exceeds rcvBuf limits if the receive buffer
+ // is currently empty.
+ std::vector<char> buf(min + 1);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_NO_FATAL_FAILURE(SendBuf(buf.data(), buf.size()));
+ // Receive the packet and make sure it's identical.
+ std::vector<char> recv_buf(buf.size() + HdrLen());
+ ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf.data(), recv_buf.size()));
+ EXPECT_EQ(
+ memcmp(recv_buf.data() + HdrLen(), buf.data(), buf.size()),
+ 0);
+ }
+}
+
+TEST_P(RawSocketTest, RecvBufLimits) {
+ // TCP stack generates RSTs for unknown endpoints and it complicates the test
+ // as we have to deal with the RST packets as well. For testing the raw socket
+ // endpoints buffer limit enforcement we can just test for UDP.
+ //
+ // We don't use SKIP_IF here because root_test_runner explicitly fails if a
+ // test is skipped.
+ if (Protocol() == IPPROTO_TCP) {
+ return;
+ }
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), AddrLen()),
+ SyscallSucceeds());
+
+ int min = 0;
+ {
+ // Discover minimum buffer size by trying to set it to zero.
+ constexpr int kRcvBufSz = 0;
+ ASSERT_THAT(
+ setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &kRcvBufSz, sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+ SyscallSucceeds());
+ }
+
+ // Now set the limit to min * 2.
+ int new_rcv_buf_sz = min * 4;
+ if (!IsRunningOnGvisor()) {
+ // Linux doubles the value specified so just set to min.
+ new_rcv_buf_sz = min * 2;
+ }
+
+ ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_RCVBUF, &new_rcv_buf_sz,
+ sizeof(new_rcv_buf_sz)),
+ SyscallSucceeds());
+ int rcv_buf_sz = 0;
+ {
+ socklen_t rcv_buf_len = sizeof(rcv_buf_sz);
+ ASSERT_THAT(
+ getsockopt(s_, SOL_SOCKET, SO_RCVBUF, &rcv_buf_sz, &rcv_buf_len),
+ SyscallSucceeds());
+ }
+
+ // Set a receive timeout so that we don't block forever on reads if the test
+ // fails.
+ struct timeval tv {
+ .tv_sec = 1, .tv_usec = 0,
+ };
+ ASSERT_THAT(setsockopt(s_, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ {
+ std::vector<char> buf(min);
+ RandomizeBuffer(buf.data(), buf.size());
+
+ ASSERT_NO_FATAL_FAILURE(SendBuf(buf.data(), buf.size()));
+ ASSERT_NO_FATAL_FAILURE(SendBuf(buf.data(), buf.size()));
+ ASSERT_NO_FATAL_FAILURE(SendBuf(buf.data(), buf.size()));
+ ASSERT_NO_FATAL_FAILURE(SendBuf(buf.data(), buf.size()));
+ int sent = 4;
+ if (IsRunningOnGvisor()) {
+ // Linux seems to drop the 4th packet even though technically it should
+ // fit in the receive buffer.
+ ASSERT_NO_FATAL_FAILURE(SendBuf(buf.data(), buf.size()));
+ sent++;
+ }
+
+ // Verify that the expected number of packets are available to be read.
+ for (int i = 0; i < sent - 1; i++) {
+ // Receive the packet and make sure it's identical.
+ std::vector<char> recv_buf(buf.size() + HdrLen());
+ ASSERT_NO_FATAL_FAILURE(ReceiveBuf(recv_buf.data(), recv_buf.size()));
+ EXPECT_EQ(memcmp(recv_buf.data() + HdrLen(), buf.data(),
+ buf.size()),
+ 0);
+ }
+
+ // Assert that the last packet is dropped because the receive buffer should
+ // be full after the first four packets.
+ std::vector<char> recv_buf(buf.size() + HdrLen());
+ struct iovec iov = {};
+ iov.iov_base = static_cast<void*>(const_cast<char*>(recv_buf.data()));
+ iov.iov_len = buf.size();
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+ ASSERT_THAT(RetryEINTR(recvmsg)(s_, &msg, MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+ }
+}
+
+void RawSocketTest::SendBuf(const char* buf, int buf_len) {
+ // It's safe to use const_cast here because sendmsg won't modify the iovec or
+ // address.
+ struct iovec iov = {};
+ iov.iov_base = static_cast<void*>(const_cast<char*>(buf));
+ iov.iov_len = static_cast<size_t>(buf_len);
+ struct msghdr msg = {};
+ msg.msg_name = static_cast<void*>(&addr_);
+ msg.msg_namelen = AddrLen();
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+ ASSERT_THAT(sendmsg(s_, &msg, 0), SyscallSucceedsWithValue(buf_len));
+}
+
+void RawSocketTest::ReceiveBuf(char* recv_buf, size_t recv_buf_len) {
+ ASSERT_NO_FATAL_FAILURE(ReceiveBufFrom(s_, recv_buf, recv_buf_len));
+}
+
+void RawSocketTest::ReceiveBufFrom(int sock, char* recv_buf,
+ size_t recv_buf_len) {
+ ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(sock, recv_buf, recv_buf_len));
+}
+
+INSTANTIATE_TEST_SUITE_P(AllInetTests, RawSocketTest,
+ ::testing::Combine(
+ ::testing::Values(IPPROTO_TCP, IPPROTO_UDP),
+ ::testing::Values(AF_INET, AF_INET6)));
+
+// AF_INET6+SOCK_RAW+IPPROTO_RAW sockets can be created, but not written to.
+TEST(RawSocketTest, IPv6ProtoRaw) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ int sock;
+ ASSERT_THAT(sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW),
+ SyscallSucceeds());
+
+ // Verify that writing yields EINVAL.
+ char buf[] = "This is such a weird little edge case";
+ struct sockaddr_in6 sin6 = {};
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = in6addr_loopback;
+ ASSERT_THAT(sendto(sock, buf, sizeof(buf), 0 /* flags */,
+ reinterpret_cast<struct sockaddr*>(&sin6), sizeof(sin6)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/raw_socket_hdrincl.cc b/test/syscalls/linux/raw_socket_hdrincl.cc
new file mode 100644
index 000000000..5bb14d57c
--- /dev/null
+++ b/test/syscalls/linux/raw_socket_hdrincl.cc
@@ -0,0 +1,406 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/capability.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <cstring>
+
+#include "gtest/gtest.h"
+#include "absl/base/internal/endian.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Tests for IPPROTO_RAW raw sockets, which implies IP_HDRINCL.
+class RawHDRINCL : public ::testing::Test {
+ protected:
+ // Creates a socket to be used in tests.
+ void SetUp() override;
+
+ // Closes the socket created by SetUp().
+ void TearDown() override;
+
+ // Returns a valid looback IP header with no payload.
+ struct iphdr LoopbackHeader();
+
+ // Fills in buf with an IP header, UDP header, and payload. Returns false if
+ // buf_size isn't large enough to hold everything.
+ bool FillPacket(char* buf, size_t buf_size, int port, const char* payload,
+ uint16_t payload_size);
+
+ // The socket used for both reading and writing.
+ int socket_;
+
+ // The loopback address.
+ struct sockaddr_in addr_;
+};
+
+void RawHDRINCL::SetUp() {
+ if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+ ASSERT_THAT(socket(AF_INET, SOCK_RAW, IPPROTO_RAW),
+ SyscallFailsWithErrno(EPERM));
+ GTEST_SKIP();
+ }
+
+ ASSERT_THAT(socket_ = socket(AF_INET, SOCK_RAW, IPPROTO_RAW),
+ SyscallSucceeds());
+
+ addr_ = {};
+
+ addr_.sin_port = IPPROTO_IP;
+ addr_.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ addr_.sin_family = AF_INET;
+}
+
+void RawHDRINCL::TearDown() {
+ // TearDown will be run even if we skip the test.
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+ EXPECT_THAT(close(socket_), SyscallSucceeds());
+ }
+}
+
+struct iphdr RawHDRINCL::LoopbackHeader() {
+ struct iphdr hdr = {};
+ hdr.ihl = 5;
+ hdr.version = 4;
+ hdr.tos = 0;
+ hdr.tot_len = absl::gbswap_16(sizeof(hdr));
+ hdr.id = 0;
+ hdr.frag_off = 0;
+ hdr.ttl = 7;
+ hdr.protocol = 1;
+ hdr.daddr = htonl(INADDR_LOOPBACK);
+ // hdr.check is set by the network stack.
+ // hdr.tot_len is set by the network stack.
+ // hdr.saddr is set by the network stack.
+ return hdr;
+}
+
+bool RawHDRINCL::FillPacket(char* buf, size_t buf_size, int port,
+ const char* payload, uint16_t payload_size) {
+ if (buf_size < sizeof(struct iphdr) + sizeof(struct udphdr) + payload_size) {
+ return false;
+ }
+
+ struct iphdr ip = LoopbackHeader();
+ ip.protocol = IPPROTO_UDP;
+
+ struct udphdr udp = {};
+ udp.source = absl::gbswap_16(port);
+ udp.dest = absl::gbswap_16(port);
+ udp.len = absl::gbswap_16(sizeof(udp) + payload_size);
+ udp.check = 0;
+
+ memcpy(buf, reinterpret_cast<char*>(&ip), sizeof(ip));
+ memcpy(buf + sizeof(ip), reinterpret_cast<char*>(&udp), sizeof(udp));
+ memcpy(buf + sizeof(ip) + sizeof(udp), payload, payload_size);
+
+ return true;
+}
+
+// We should be able to create multiple IPPROTO_RAW sockets. RawHDRINCL::Setup
+// creates the first one, so we only have to create one more here.
+TEST_F(RawHDRINCL, MultipleCreation) {
+ int s2;
+ ASSERT_THAT(s2 = socket(AF_INET, SOCK_RAW, IPPROTO_RAW), SyscallSucceeds());
+
+ ASSERT_THAT(close(s2), SyscallSucceeds());
+}
+
+// Test that shutting down an unconnected socket fails.
+TEST_F(RawHDRINCL, FailShutdownWithoutConnect) {
+ ASSERT_THAT(shutdown(socket_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+ ASSERT_THAT(shutdown(socket_, SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
+}
+
+// Test that listen() fails.
+TEST_F(RawHDRINCL, FailListen) {
+ ASSERT_THAT(listen(socket_, 1), SyscallFailsWithErrno(ENOTSUP));
+}
+
+// Test that accept() fails.
+TEST_F(RawHDRINCL, FailAccept) {
+ struct sockaddr saddr;
+ socklen_t addrlen;
+ ASSERT_THAT(accept(socket_, &saddr, &addrlen),
+ SyscallFailsWithErrno(ENOTSUP));
+}
+
+// Test that the socket is writable immediately.
+TEST_F(RawHDRINCL, PollWritableImmediately) {
+ struct pollfd pfd = {};
+ pfd.fd = socket_;
+ pfd.events = POLLOUT;
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 0), SyscallSucceedsWithValue(1));
+}
+
+// Test that the socket isn't readable.
+TEST_F(RawHDRINCL, NotReadable) {
+ // Try to receive data with MSG_DONTWAIT, which returns immediately if there's
+ // nothing to be read.
+ char buf[117];
+ ASSERT_THAT(RetryEINTR(recv)(socket_, buf, sizeof(buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Test that we can connect() to a valid IP (loopback).
+TEST_F(RawHDRINCL, ConnectToLoopback) {
+ ASSERT_THAT(connect(socket_, reinterpret_cast<struct sockaddr*>(&addr_),
+ sizeof(addr_)),
+ SyscallSucceeds());
+}
+
+TEST_F(RawHDRINCL, SendWithoutConnectSucceeds) {
+ struct iphdr hdr = LoopbackHeader();
+ ASSERT_THAT(send(socket_, &hdr, sizeof(hdr), 0),
+ SyscallSucceedsWithValue(sizeof(hdr)));
+}
+
+// HDRINCL implies write-only. Verify that we can't read a packet sent to
+// loopback.
+TEST_F(RawHDRINCL, NotReadableAfterWrite) {
+ ASSERT_THAT(connect(socket_, reinterpret_cast<struct sockaddr*>(&addr_),
+ sizeof(addr_)),
+ SyscallSucceeds());
+
+ // Construct a packet with an IP header, UDP header, and payload.
+ constexpr char kPayload[] = "odst";
+ char packet[sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kPayload)];
+ ASSERT_TRUE(FillPacket(packet, sizeof(packet), 40000 /* port */, kPayload,
+ sizeof(kPayload)));
+
+ socklen_t addrlen = sizeof(addr_);
+ ASSERT_NO_FATAL_FAILURE(
+ sendto(socket_, reinterpret_cast<void*>(&packet), sizeof(packet), 0,
+ reinterpret_cast<struct sockaddr*>(&addr_), addrlen));
+
+ struct pollfd pfd = {};
+ pfd.fd = socket_;
+ pfd.events = POLLIN;
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, 1000), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(RawHDRINCL, WriteTooSmall) {
+ ASSERT_THAT(connect(socket_, reinterpret_cast<struct sockaddr*>(&addr_),
+ sizeof(addr_)),
+ SyscallSucceeds());
+
+ // This is smaller than the size of an IP header.
+ constexpr char kBuf[] = "JP5";
+ ASSERT_THAT(send(socket_, kBuf, sizeof(kBuf), 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Bind to localhost.
+TEST_F(RawHDRINCL, BindToLocalhost) {
+ ASSERT_THAT(
+ bind(socket_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)),
+ SyscallSucceeds());
+}
+
+// Bind to a different address.
+TEST_F(RawHDRINCL, BindToInvalid) {
+ struct sockaddr_in bind_addr = {};
+ bind_addr.sin_family = AF_INET;
+ bind_addr.sin_addr = {1}; // 1.0.0.0 - An address that we can't bind to.
+ ASSERT_THAT(bind(socket_, reinterpret_cast<struct sockaddr*>(&bind_addr),
+ sizeof(bind_addr)),
+ SyscallFailsWithErrno(EADDRNOTAVAIL));
+}
+
+// Send and receive a packet.
+TEST_F(RawHDRINCL, SendAndReceive) {
+ int port = 40000;
+ if (!IsRunningOnGvisor()) {
+ port = static_cast<short>(ASSERT_NO_ERRNO_AND_VALUE(
+ PortAvailable(0, AddressFamily::kIpv4, SocketType::kUdp, false)));
+ }
+
+ // IPPROTO_RAW sockets are write-only. We'll have to open another socket to
+ // read what we write.
+ FileDescriptor udp_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_UDP));
+
+ // Construct a packet with an IP header, UDP header, and payload.
+ constexpr char kPayload[] = "toto";
+ char packet[sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kPayload)];
+ ASSERT_TRUE(
+ FillPacket(packet, sizeof(packet), port, kPayload, sizeof(kPayload)));
+
+ socklen_t addrlen = sizeof(addr_);
+ ASSERT_NO_FATAL_FAILURE(sendto(socket_, &packet, sizeof(packet), 0,
+ reinterpret_cast<struct sockaddr*>(&addr_),
+ addrlen));
+
+ // Receive the payload.
+ char recv_buf[sizeof(packet)];
+ struct sockaddr_in src;
+ socklen_t src_size = sizeof(src);
+ ASSERT_THAT(recvfrom(udp_sock.get(), recv_buf, sizeof(recv_buf), 0,
+ reinterpret_cast<struct sockaddr*>(&src), &src_size),
+ SyscallSucceedsWithValue(sizeof(packet)));
+ EXPECT_EQ(
+ memcmp(kPayload, recv_buf + sizeof(struct iphdr) + sizeof(struct udphdr),
+ sizeof(kPayload)),
+ 0);
+ // The network stack should have set the source address.
+ EXPECT_EQ(src.sin_family, AF_INET);
+ EXPECT_EQ(absl::gbswap_32(src.sin_addr.s_addr), INADDR_LOOPBACK);
+ // The packet ID should not be 0, as the packet has DF=0.
+ struct iphdr* iphdr = reinterpret_cast<struct iphdr*>(recv_buf);
+ EXPECT_NE(iphdr->id, 0);
+}
+
+// Send and receive a packet where the sendto address is not the same as the
+// provided destination.
+TEST_F(RawHDRINCL, SendAndReceiveDifferentAddress) {
+ int port = 40000;
+ if (!IsRunningOnGvisor()) {
+ port = static_cast<short>(ASSERT_NO_ERRNO_AND_VALUE(
+ PortAvailable(0, AddressFamily::kIpv4, SocketType::kUdp, false)));
+ }
+
+ // IPPROTO_RAW sockets are write-only. We'll have to open another socket to
+ // read what we write.
+ FileDescriptor udp_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_UDP));
+
+ // Construct a packet with an IP header, UDP header, and payload.
+ constexpr char kPayload[] = "toto";
+ char packet[sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kPayload)];
+ ASSERT_TRUE(
+ FillPacket(packet, sizeof(packet), port, kPayload, sizeof(kPayload)));
+ // Overwrite the IP destination address with an IP we can't get to.
+ struct iphdr iphdr = {};
+ memcpy(&iphdr, packet, sizeof(iphdr));
+ iphdr.daddr = 42;
+ memcpy(packet, &iphdr, sizeof(iphdr));
+
+ socklen_t addrlen = sizeof(addr_);
+ ASSERT_NO_FATAL_FAILURE(sendto(socket_, &packet, sizeof(packet), 0,
+ reinterpret_cast<struct sockaddr*>(&addr_),
+ addrlen));
+
+ // Receive the payload, since sendto should replace the bad destination with
+ // localhost.
+ char recv_buf[sizeof(packet)];
+ struct sockaddr_in src;
+ socklen_t src_size = sizeof(src);
+ ASSERT_THAT(recvfrom(udp_sock.get(), recv_buf, sizeof(recv_buf), 0,
+ reinterpret_cast<struct sockaddr*>(&src), &src_size),
+ SyscallSucceedsWithValue(sizeof(packet)));
+ EXPECT_EQ(
+ memcmp(kPayload, recv_buf + sizeof(struct iphdr) + sizeof(struct udphdr),
+ sizeof(kPayload)),
+ 0);
+ // The network stack should have set the source address.
+ EXPECT_EQ(src.sin_family, AF_INET);
+ EXPECT_EQ(absl::gbswap_32(src.sin_addr.s_addr), INADDR_LOOPBACK);
+ // The packet ID should not be 0, as the packet has DF=0.
+ struct iphdr recv_iphdr = {};
+ memcpy(&recv_iphdr, recv_buf, sizeof(recv_iphdr));
+ EXPECT_NE(recv_iphdr.id, 0);
+ // The destination address should be localhost, not the bad IP we set
+ // initially.
+ EXPECT_EQ(absl::gbswap_32(recv_iphdr.daddr), INADDR_LOOPBACK);
+}
+
+// Send and receive a packet w/ the IP_HDRINCL option set.
+TEST_F(RawHDRINCL, SendAndReceiveIPHdrIncl) {
+ int port = 40000;
+ if (!IsRunningOnGvisor()) {
+ port = static_cast<short>(ASSERT_NO_ERRNO_AND_VALUE(
+ PortAvailable(0, AddressFamily::kIpv4, SocketType::kUdp, false)));
+ }
+
+ FileDescriptor recv_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_UDP));
+
+ FileDescriptor send_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_UDP));
+
+ // Enable IP_HDRINCL option so that we can build and send w/ an IP
+ // header.
+ constexpr int kSockOptOn = 1;
+ ASSERT_THAT(setsockopt(send_sock.get(), SOL_IP, IP_HDRINCL, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ // This is not strictly required but we do it to make sure that setting
+ // IP_HDRINCL on a non IPPROTO_RAW socket does not prevent it from receiving
+ // packets.
+ ASSERT_THAT(setsockopt(recv_sock.get(), SOL_IP, IP_HDRINCL, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Construct a packet with an IP header, UDP header, and payload.
+ constexpr char kPayload[] = "toto";
+ char packet[sizeof(struct iphdr) + sizeof(struct udphdr) + sizeof(kPayload)];
+ ASSERT_TRUE(
+ FillPacket(packet, sizeof(packet), port, kPayload, sizeof(kPayload)));
+
+ socklen_t addrlen = sizeof(addr_);
+ ASSERT_NO_FATAL_FAILURE(sendto(send_sock.get(), &packet, sizeof(packet), 0,
+ reinterpret_cast<struct sockaddr*>(&addr_),
+ addrlen));
+
+ // Receive the payload.
+ char recv_buf[sizeof(packet)];
+ struct sockaddr_in src;
+ socklen_t src_size = sizeof(src);
+ ASSERT_THAT(recvfrom(recv_sock.get(), recv_buf, sizeof(recv_buf), 0,
+ reinterpret_cast<struct sockaddr*>(&src), &src_size),
+ SyscallSucceedsWithValue(sizeof(packet)));
+ EXPECT_EQ(
+ memcmp(kPayload, recv_buf + sizeof(struct iphdr) + sizeof(struct udphdr),
+ sizeof(kPayload)),
+ 0);
+ // The network stack should have set the source address.
+ EXPECT_EQ(src.sin_family, AF_INET);
+ EXPECT_EQ(absl::gbswap_32(src.sin_addr.s_addr), INADDR_LOOPBACK);
+ struct iphdr iphdr = {};
+ memcpy(&iphdr, recv_buf, sizeof(iphdr));
+ EXPECT_NE(iphdr.id, 0);
+
+ // Also verify that the packet we just sent was not delivered to the
+ // IPPROTO_RAW socket.
+ {
+ char recv_buf[sizeof(packet)];
+ struct sockaddr_in src;
+ socklen_t src_size = sizeof(src);
+ ASSERT_THAT(recvfrom(socket_, recv_buf, sizeof(recv_buf), MSG_DONTWAIT,
+ reinterpret_cast<struct sockaddr*>(&src), &src_size),
+ SyscallFailsWithErrno(EAGAIN));
+ }
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/raw_socket_icmp.cc b/test/syscalls/linux/raw_socket_icmp.cc
new file mode 100644
index 000000000..3de898df7
--- /dev/null
+++ b/test/syscalls/linux/raw_socket_icmp.cc
@@ -0,0 +1,514 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/capability.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <cstdint>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// The size of an empty ICMP packet and IP header together.
+constexpr size_t kEmptyICMPSize = 28;
+
+// ICMP raw sockets get their own special tests because Linux automatically
+// responds to ICMP echo requests, and thus a single echo request sent via
+// loopback leads to 2 received ICMP packets.
+
+class RawSocketICMPTest : public ::testing::Test {
+ protected:
+ // Creates a socket to be used in tests.
+ void SetUp() override;
+
+ // Closes the socket created by SetUp().
+ void TearDown() override;
+
+ // Checks that both an ICMP echo request and reply are received. Calls should
+ // be wrapped in ASSERT_NO_FATAL_FAILURE.
+ void ExpectICMPSuccess(const struct icmphdr& icmp);
+
+ // Sends icmp via s_.
+ void SendEmptyICMP(const struct icmphdr& icmp);
+
+ // Sends icmp via s_ to the given address.
+ void SendEmptyICMPTo(int sock, const struct sockaddr_in& addr,
+ const struct icmphdr& icmp);
+
+ // Reads from s_ into recv_buf.
+ void ReceiveICMP(char* recv_buf, size_t recv_buf_len, size_t expected_size,
+ struct sockaddr_in* src);
+
+ // Reads from sock into recv_buf.
+ void ReceiveICMPFrom(char* recv_buf, size_t recv_buf_len,
+ size_t expected_size, struct sockaddr_in* src, int sock);
+
+ // The socket used for both reading and writing.
+ int s_;
+
+ // The loopback address.
+ struct sockaddr_in addr_;
+};
+
+void RawSocketICMPTest::SetUp() {
+ if (!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+ ASSERT_THAT(socket(AF_INET, SOCK_RAW, IPPROTO_ICMP),
+ SyscallFailsWithErrno(EPERM));
+ GTEST_SKIP();
+ }
+
+ ASSERT_THAT(s_ = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP), SyscallSucceeds());
+
+ addr_ = {};
+
+ // "On raw sockets sin_port is set to the IP protocol." - ip(7).
+ addr_.sin_port = IPPROTO_IP;
+ addr_.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ addr_.sin_family = AF_INET;
+}
+
+void RawSocketICMPTest::TearDown() {
+ // TearDown will be run even if we skip the test.
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW))) {
+ EXPECT_THAT(close(s_), SyscallSucceeds());
+ }
+}
+
+// We'll only read an echo in this case, as the kernel won't respond to the
+// malformed ICMP checksum.
+TEST_F(RawSocketICMPTest, SendAndReceiveBadChecksum) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence,
+ // and ID. None of that should matter for raw sockets - the kernel should
+ // still give us the packet.
+ struct icmphdr icmp;
+ icmp.type = ICMP_ECHO;
+ icmp.code = 0;
+ icmp.checksum = 0;
+ icmp.un.echo.sequence = 2012;
+ icmp.un.echo.id = 2014;
+ ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp));
+
+ // Veryify that we get the echo, then that there's nothing else to read.
+ char recv_buf[kEmptyICMPSize];
+ struct sockaddr_in src;
+ ASSERT_NO_FATAL_FAILURE(
+ ReceiveICMP(recv_buf, sizeof(recv_buf), sizeof(struct icmphdr), &src));
+ EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0);
+ // The packet should be identical to what we sent.
+ EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), &icmp, sizeof(icmp)), 0);
+
+ // And there should be nothing left to read.
+ EXPECT_THAT(RetryEINTR(recv)(s_, recv_buf, sizeof(recv_buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Send and receive an ICMP packet.
+TEST_F(RawSocketICMPTest, SendAndReceive) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ // Prepare and send an ICMP packet. Use arbitrary junk for sequence and ID.
+ // None of that should matter for raw sockets - the kernel should still give
+ // us the packet.
+ struct icmphdr icmp;
+ icmp.type = ICMP_ECHO;
+ icmp.code = 0;
+ icmp.checksum = 0;
+ icmp.un.echo.sequence = 2012;
+ icmp.un.echo.id = 2014;
+ icmp.checksum = ICMPChecksum(icmp, NULL, 0);
+ ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp));
+
+ ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp));
+}
+
+// We should be able to create multiple raw sockets for the same protocol and
+// receive the same packet on both.
+TEST_F(RawSocketICMPTest, MultipleSocketReceive) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ FileDescriptor s2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_RAW, IPPROTO_ICMP));
+
+ // Prepare and send an ICMP packet. Use arbitrary junk for sequence and ID.
+ // None of that should matter for raw sockets - the kernel should still give
+ // us the packet.
+ struct icmphdr icmp;
+ icmp.type = ICMP_ECHO;
+ icmp.code = 0;
+ icmp.checksum = 0;
+ icmp.un.echo.sequence = 2016;
+ icmp.un.echo.id = 2018;
+ icmp.checksum = ICMPChecksum(icmp, NULL, 0);
+ ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp));
+
+ // Both sockets will receive the echo request and reply in indeterminate
+ // order, so we'll need to read 2 packets from each.
+
+ // Receive on socket 1.
+ constexpr int kBufSize = kEmptyICMPSize;
+ char recv_buf1[2][kBufSize];
+ struct sockaddr_in src;
+ for (int i = 0; i < 2; i++) {
+ ASSERT_NO_FATAL_FAILURE(ReceiveICMP(recv_buf1[i],
+ ABSL_ARRAYSIZE(recv_buf1[i]),
+ sizeof(struct icmphdr), &src));
+ EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0);
+ }
+
+ // Receive on socket 2.
+ char recv_buf2[2][kBufSize];
+ for (int i = 0; i < 2; i++) {
+ ASSERT_NO_FATAL_FAILURE(
+ ReceiveICMPFrom(recv_buf2[i], ABSL_ARRAYSIZE(recv_buf2[i]),
+ sizeof(struct icmphdr), &src, s2.get()));
+ EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0);
+ }
+
+ // Ensure both sockets receive identical packets.
+ int types[] = {ICMP_ECHO, ICMP_ECHOREPLY};
+ for (int type : types) {
+ auto match_type = [=](char buf[kBufSize]) {
+ struct icmphdr* icmp =
+ reinterpret_cast<struct icmphdr*>(buf + sizeof(struct iphdr));
+ return icmp->type == type;
+ };
+ auto icmp1_it =
+ std::find_if(std::begin(recv_buf1), std::end(recv_buf1), match_type);
+ auto icmp2_it =
+ std::find_if(std::begin(recv_buf2), std::end(recv_buf2), match_type);
+ ASSERT_NE(icmp1_it, std::end(recv_buf1));
+ ASSERT_NE(icmp2_it, std::end(recv_buf2));
+ EXPECT_EQ(memcmp(*icmp1_it + sizeof(struct iphdr),
+ *icmp2_it + sizeof(struct iphdr), sizeof(icmp)),
+ 0);
+ }
+}
+
+// A raw ICMP socket and ping socket should both receive the ICMP packets
+// intended for the ping socket.
+TEST_F(RawSocketICMPTest, RawAndPingSockets) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ FileDescriptor ping_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP));
+
+ // Ping sockets take care of the ICMP ID and checksum.
+ struct icmphdr icmp;
+ icmp.type = ICMP_ECHO;
+ icmp.code = 0;
+ icmp.un.echo.sequence = *static_cast<unsigned short*>(&icmp.un.echo.sequence);
+ ASSERT_THAT(RetryEINTR(sendto)(ping_sock.get(), &icmp, sizeof(icmp), 0,
+ reinterpret_cast<struct sockaddr*>(&addr_),
+ sizeof(addr_)),
+ SyscallSucceedsWithValue(sizeof(icmp)));
+
+ // Receive on socket 1, which receives the echo request and reply in
+ // indeterminate order.
+ constexpr int kBufSize = kEmptyICMPSize;
+ char recv_buf1[2][kBufSize];
+ struct sockaddr_in src;
+ for (int i = 0; i < 2; i++) {
+ ASSERT_NO_FATAL_FAILURE(
+ ReceiveICMP(recv_buf1[i], kBufSize, sizeof(struct icmphdr), &src));
+ EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0);
+ }
+
+ // Receive on socket 2. Ping sockets only get the echo reply, not the initial
+ // echo.
+ char ping_recv_buf[kBufSize];
+ ASSERT_THAT(RetryEINTR(recv)(ping_sock.get(), ping_recv_buf, kBufSize, 0),
+ SyscallSucceedsWithValue(sizeof(struct icmphdr)));
+
+ // Ensure both sockets receive identical echo reply packets.
+ auto match_type_raw = [=](char buf[kBufSize]) {
+ struct icmphdr* icmp =
+ reinterpret_cast<struct icmphdr*>(buf + sizeof(struct iphdr));
+ return icmp->type == ICMP_ECHOREPLY;
+ };
+ auto raw_reply_it =
+ std::find_if(std::begin(recv_buf1), std::end(recv_buf1), match_type_raw);
+ ASSERT_NE(raw_reply_it, std::end(recv_buf1));
+ EXPECT_EQ(
+ memcmp(*raw_reply_it + sizeof(struct iphdr), ping_recv_buf, sizeof(icmp)),
+ 0);
+}
+
+// A raw ICMP socket should be able to send a malformed short ICMP Echo Request,
+// while ping socket should not.
+// Neither should be able to receieve a short malformed packet.
+TEST_F(RawSocketICMPTest, ShortEchoRawAndPingSockets) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ FileDescriptor ping_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP));
+
+ struct icmphdr icmp;
+ icmp.type = ICMP_ECHO;
+ icmp.code = 0;
+ icmp.un.echo.sequence = 0;
+ icmp.un.echo.id = 6789;
+ icmp.checksum = 0;
+ icmp.checksum = ICMPChecksum(icmp, NULL, 0);
+
+ // Omit 2 bytes from ICMP packet.
+ constexpr int kShortICMPSize = sizeof(icmp) - 2;
+
+ // Sending a malformed short ICMP message to a ping socket should fail.
+ ASSERT_THAT(RetryEINTR(sendto)(ping_sock.get(), &icmp, kShortICMPSize, 0,
+ reinterpret_cast<struct sockaddr*>(&addr_),
+ sizeof(addr_)),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Sending a malformed short ICMP message to a raw socket should not fail.
+ ASSERT_THAT(RetryEINTR(sendto)(s_, &icmp, kShortICMPSize, 0,
+ reinterpret_cast<struct sockaddr*>(&addr_),
+ sizeof(addr_)),
+ SyscallSucceedsWithValue(kShortICMPSize));
+
+ // Neither Ping nor Raw socket should have anything to read.
+ char recv_buf[kEmptyICMPSize];
+ EXPECT_THAT(RetryEINTR(recv)(ping_sock.get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+ EXPECT_THAT(RetryEINTR(recv)(s_, recv_buf, sizeof(recv_buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// A raw ICMP socket should be able to send a malformed short ICMP Echo Reply,
+// while ping socket should not.
+// Neither should be able to receieve a short malformed packet.
+TEST_F(RawSocketICMPTest, ShortEchoReplyRawAndPingSockets) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ FileDescriptor ping_sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP));
+
+ struct icmphdr icmp;
+ icmp.type = ICMP_ECHOREPLY;
+ icmp.code = 0;
+ icmp.un.echo.sequence = 0;
+ icmp.un.echo.id = 6789;
+ icmp.checksum = 0;
+ icmp.checksum = ICMPChecksum(icmp, NULL, 0);
+
+ // Omit 2 bytes from ICMP packet.
+ constexpr int kShortICMPSize = sizeof(icmp) - 2;
+
+ // Sending a malformed short ICMP message to a ping socket should fail.
+ ASSERT_THAT(RetryEINTR(sendto)(ping_sock.get(), &icmp, kShortICMPSize, 0,
+ reinterpret_cast<struct sockaddr*>(&addr_),
+ sizeof(addr_)),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Sending a malformed short ICMP message to a raw socket should not fail.
+ ASSERT_THAT(RetryEINTR(sendto)(s_, &icmp, kShortICMPSize, 0,
+ reinterpret_cast<struct sockaddr*>(&addr_),
+ sizeof(addr_)),
+ SyscallSucceedsWithValue(kShortICMPSize));
+
+ // Neither Ping nor Raw socket should have anything to read.
+ char recv_buf[kEmptyICMPSize];
+ EXPECT_THAT(RetryEINTR(recv)(ping_sock.get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+ EXPECT_THAT(RetryEINTR(recv)(s_, recv_buf, sizeof(recv_buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Test that connect() sends packets to the right place.
+TEST_F(RawSocketICMPTest, SendAndReceiveViaConnect) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)),
+ SyscallSucceeds());
+
+ // Prepare and send an ICMP packet. Use arbitrary junk for sequence and ID.
+ // None of that should matter for raw sockets - the kernel should still give
+ // us the packet.
+ struct icmphdr icmp;
+ icmp.type = ICMP_ECHO;
+ icmp.code = 0;
+ icmp.checksum = 0;
+ icmp.un.echo.sequence = 2003;
+ icmp.un.echo.id = 2004;
+ icmp.checksum = ICMPChecksum(icmp, NULL, 0);
+ ASSERT_THAT(send(s_, &icmp, sizeof(icmp), 0),
+ SyscallSucceedsWithValue(sizeof(icmp)));
+
+ ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp));
+}
+
+// Bind to localhost, then send and receive packets.
+TEST_F(RawSocketICMPTest, BindSendAndReceive) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)),
+ SyscallSucceeds());
+
+ // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence,
+ // and ID. None of that should matter for raw sockets - the kernel should
+ // still give us the packet.
+ struct icmphdr icmp;
+ icmp.type = ICMP_ECHO;
+ icmp.code = 0;
+ icmp.checksum = 0;
+ icmp.un.echo.sequence = 2004;
+ icmp.un.echo.id = 2007;
+ icmp.checksum = ICMPChecksum(icmp, NULL, 0);
+ ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp));
+
+ ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp));
+}
+
+// Bind and connect to localhost and send/receive packets.
+TEST_F(RawSocketICMPTest, BindConnectSendAndReceive) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+ ASSERT_THAT(
+ bind(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ connect(s_, reinterpret_cast<struct sockaddr*>(&addr_), sizeof(addr_)),
+ SyscallSucceeds());
+
+ // Prepare and send an ICMP packet. Use arbitrary junk for checksum, sequence,
+ // and ID. None of that should matter for raw sockets - the kernel should
+ // still give us the packet.
+ struct icmphdr icmp;
+ icmp.type = ICMP_ECHO;
+ icmp.code = 0;
+ icmp.checksum = 0;
+ icmp.un.echo.sequence = 2010;
+ icmp.un.echo.id = 7;
+ icmp.checksum = ICMPChecksum(icmp, NULL, 0);
+ ASSERT_NO_FATAL_FAILURE(SendEmptyICMP(icmp));
+
+ ASSERT_NO_FATAL_FAILURE(ExpectICMPSuccess(icmp));
+}
+
+void RawSocketICMPTest::ExpectICMPSuccess(const struct icmphdr& icmp) {
+ // We're going to receive both the echo request and reply, but the order is
+ // indeterminate.
+ char recv_buf[kEmptyICMPSize];
+ struct sockaddr_in src;
+ bool received_request = false;
+ bool received_reply = false;
+
+ for (int i = 0; i < 2; i++) {
+ // Receive the packet.
+ ASSERT_NO_FATAL_FAILURE(ReceiveICMP(recv_buf, ABSL_ARRAYSIZE(recv_buf),
+ sizeof(struct icmphdr), &src));
+ EXPECT_EQ(memcmp(&src, &addr_, sizeof(src)), 0);
+ struct icmphdr* recvd_icmp =
+ reinterpret_cast<struct icmphdr*>(recv_buf + sizeof(struct iphdr));
+ switch (recvd_icmp->type) {
+ case ICMP_ECHO:
+ EXPECT_FALSE(received_request);
+ received_request = true;
+ // The packet should be identical to what we sent.
+ EXPECT_EQ(memcmp(recv_buf + sizeof(struct iphdr), &icmp, sizeof(icmp)),
+ 0);
+ break;
+
+ case ICMP_ECHOREPLY:
+ EXPECT_FALSE(received_reply);
+ received_reply = true;
+ // Most fields should be the same.
+ EXPECT_EQ(recvd_icmp->code, icmp.code);
+ EXPECT_EQ(recvd_icmp->un.echo.sequence, icmp.un.echo.sequence);
+ EXPECT_EQ(recvd_icmp->un.echo.id, icmp.un.echo.id);
+ // A couple are different.
+ EXPECT_EQ(recvd_icmp->type, ICMP_ECHOREPLY);
+ // The checksum computed over the reply should still be valid.
+ EXPECT_EQ(ICMPChecksum(*recvd_icmp, NULL, 0), 0);
+ break;
+ }
+ }
+
+ ASSERT_TRUE(received_request);
+ ASSERT_TRUE(received_reply);
+}
+
+void RawSocketICMPTest::SendEmptyICMP(const struct icmphdr& icmp) {
+ ASSERT_NO_FATAL_FAILURE(SendEmptyICMPTo(s_, addr_, icmp));
+}
+
+void RawSocketICMPTest::SendEmptyICMPTo(int sock,
+ const struct sockaddr_in& addr,
+ const struct icmphdr& icmp) {
+ // It's safe to use const_cast here because sendmsg won't modify the iovec or
+ // address.
+ struct iovec iov = {};
+ iov.iov_base = static_cast<void*>(const_cast<struct icmphdr*>(&icmp));
+ iov.iov_len = sizeof(icmp);
+ struct msghdr msg = {};
+ msg.msg_name = static_cast<void*>(const_cast<struct sockaddr_in*>(&addr));
+ msg.msg_namelen = sizeof(addr);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+ ASSERT_THAT(sendmsg(sock, &msg, 0), SyscallSucceedsWithValue(sizeof(icmp)));
+}
+
+void RawSocketICMPTest::ReceiveICMP(char* recv_buf, size_t recv_buf_len,
+ size_t expected_size,
+ struct sockaddr_in* src) {
+ ASSERT_NO_FATAL_FAILURE(
+ ReceiveICMPFrom(recv_buf, recv_buf_len, expected_size, src, s_));
+}
+
+void RawSocketICMPTest::ReceiveICMPFrom(char* recv_buf, size_t recv_buf_len,
+ size_t expected_size,
+ struct sockaddr_in* src, int sock) {
+ struct iovec iov = {};
+ iov.iov_base = recv_buf;
+ iov.iov_len = recv_buf_len;
+ struct msghdr msg = {};
+ msg.msg_name = src;
+ msg.msg_namelen = sizeof(*src);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = 0;
+ // We should receive the ICMP packet plus 20 bytes of IP header.
+ ASSERT_THAT(recvmsg(sock, &msg, 0),
+ SyscallSucceedsWithValue(expected_size + sizeof(struct iphdr)));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/read.cc b/test/syscalls/linux/read.cc
new file mode 100644
index 000000000..2633ba31b
--- /dev/null
+++ b/test/syscalls/linux/read.cc
@@ -0,0 +1,118 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class ReadTest : public ::testing::Test {
+ void SetUp() override {
+ name_ = NewTempAbsPath();
+ int fd;
+ ASSERT_THAT(fd = open(name_.c_str(), O_CREAT, 0644), SyscallSucceeds());
+ ASSERT_THAT(close(fd), SyscallSucceeds());
+ }
+
+ void TearDown() override { unlink(name_.c_str()); }
+
+ public:
+ std::string name_;
+};
+
+TEST_F(ReadTest, ZeroBuffer) {
+ int fd;
+ ASSERT_THAT(fd = open(name_.c_str(), O_RDWR), SyscallSucceeds());
+
+ char msg[] = "hello world";
+ EXPECT_THAT(PwriteFd(fd, msg, strlen(msg), 0),
+ SyscallSucceedsWithValue(strlen(msg)));
+
+ char buf[10];
+ EXPECT_THAT(ReadFd(fd, buf, 0), SyscallSucceedsWithValue(0));
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST_F(ReadTest, EmptyFileReturnsZeroAtEOF) {
+ int fd;
+ ASSERT_THAT(fd = open(name_.c_str(), O_RDWR), SyscallSucceeds());
+
+ char eof_buf[10];
+ EXPECT_THAT(ReadFd(fd, eof_buf, 10), SyscallSucceedsWithValue(0));
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST_F(ReadTest, EofAfterRead) {
+ int fd;
+ ASSERT_THAT(fd = open(name_.c_str(), O_RDWR), SyscallSucceeds());
+
+ // Write some bytes to be read.
+ constexpr char kMessage[] = "hello world";
+ EXPECT_THAT(PwriteFd(fd, kMessage, sizeof(kMessage), 0),
+ SyscallSucceedsWithValue(sizeof(kMessage)));
+
+ // Read all of the bytes at once.
+ char buf[sizeof(kMessage)];
+ EXPECT_THAT(ReadFd(fd, buf, sizeof(kMessage)),
+ SyscallSucceedsWithValue(sizeof(kMessage)));
+
+ // Read again with a non-zero buffer and expect EOF.
+ char eof_buf[10];
+ EXPECT_THAT(ReadFd(fd, eof_buf, 10), SyscallSucceedsWithValue(0));
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST_F(ReadTest, DevNullReturnsEof) {
+ int fd;
+ ASSERT_THAT(fd = open("/dev/null", O_RDONLY), SyscallSucceeds());
+ std::vector<char> buf(1);
+ EXPECT_THAT(ReadFd(fd, buf.data(), 1), SyscallSucceedsWithValue(0));
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+const int kReadSize = 128 * 1024;
+
+// Do not allow random save as it could lead to partial reads.
+TEST_F(ReadTest, CanReadFullyFromDevZero_NoRandomSave) {
+ int fd;
+ ASSERT_THAT(fd = open("/dev/zero", O_RDONLY), SyscallSucceeds());
+
+ std::vector<char> buf(kReadSize, 1);
+ EXPECT_THAT(ReadFd(fd, buf.data(), kReadSize),
+ SyscallSucceedsWithValue(kReadSize));
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ EXPECT_EQ(std::vector<char>(kReadSize, 0), buf);
+}
+
+TEST_F(ReadTest, ReadDirectoryFails) {
+ const FileDescriptor file =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(GetAbsoluteTestTmpdir(), O_RDONLY));
+ std::vector<char> buf(1);
+ EXPECT_THAT(ReadFd(file.get(), buf.data(), 1), SyscallFailsWithErrno(EISDIR));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/readahead.cc b/test/syscalls/linux/readahead.cc
new file mode 100644
index 000000000..09703b5c1
--- /dev/null
+++ b/test/syscalls/linux/readahead.cc
@@ -0,0 +1,91 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(ReadaheadTest, InvalidFD) {
+ EXPECT_THAT(readahead(-1, 1, 1), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(ReadaheadTest, InvalidOffset) {
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+ EXPECT_THAT(readahead(fd.get(), -1, 1), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(ReadaheadTest, ValidOffset) {
+ constexpr char kData[] = "123";
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+
+ // N.B. The implementation of readahead is filesystem-specific, and a file
+ // backed by ram may return EINVAL because there is nothing to be read.
+ EXPECT_THAT(readahead(fd.get(), 1, 1), AnyOf(SyscallSucceedsWithValue(0),
+ SyscallFailsWithErrno(EINVAL)));
+}
+
+TEST(ReadaheadTest, PastEnd) {
+ constexpr char kData[] = "123";
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+ // See above.
+ EXPECT_THAT(readahead(fd.get(), 2, 2), AnyOf(SyscallSucceedsWithValue(0),
+ SyscallFailsWithErrno(EINVAL)));
+}
+
+TEST(ReadaheadTest, CrossesEnd) {
+ constexpr char kData[] = "123";
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+ // See above.
+ EXPECT_THAT(readahead(fd.get(), 4, 2), AnyOf(SyscallSucceedsWithValue(0),
+ SyscallFailsWithErrno(EINVAL)));
+}
+
+TEST(ReadaheadTest, WriteOnly) {
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_WRONLY));
+ EXPECT_THAT(readahead(fd.get(), 0, 1), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(ReadaheadTest, InvalidSize) {
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+ EXPECT_THAT(readahead(fd.get(), 0, -1), SyscallFailsWithErrno(EINVAL));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/readv.cc b/test/syscalls/linux/readv.cc
new file mode 100644
index 000000000..baaf9f757
--- /dev/null
+++ b/test/syscalls/linux/readv.cc
@@ -0,0 +1,294 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/syscalls/linux/readv_common.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/timer_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class ReadvTest : public FileTest {
+ void SetUp() override {
+ FileTest::SetUp();
+
+ ASSERT_THAT(write(test_file_fd_.get(), kReadvTestData, kReadvTestDataSize),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+ ASSERT_THAT(lseek(test_file_fd_.get(), 0, SEEK_SET),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(write(test_pipe_[1], kReadvTestData, kReadvTestDataSize),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+ }
+};
+
+TEST_F(ReadvTest, ReadOneBufferPerByte_File) {
+ ReadOneBufferPerByte(test_file_fd_.get());
+}
+
+TEST_F(ReadvTest, ReadOneBufferPerByte_Pipe) {
+ ReadOneBufferPerByte(test_pipe_[0]);
+}
+
+TEST_F(ReadvTest, ReadOneHalfAtATime_File) {
+ ReadOneHalfAtATime(test_file_fd_.get());
+}
+
+TEST_F(ReadvTest, ReadOneHalfAtATime_Pipe) {
+ ReadOneHalfAtATime(test_pipe_[0]);
+}
+
+TEST_F(ReadvTest, ReadAllOneBuffer_File) {
+ ReadAllOneBuffer(test_file_fd_.get());
+}
+
+TEST_F(ReadvTest, ReadAllOneBuffer_Pipe) { ReadAllOneBuffer(test_pipe_[0]); }
+
+TEST_F(ReadvTest, ReadAllOneLargeBuffer_File) {
+ ReadAllOneLargeBuffer(test_file_fd_.get());
+}
+
+TEST_F(ReadvTest, ReadAllOneLargeBuffer_Pipe) {
+ ReadAllOneLargeBuffer(test_pipe_[0]);
+}
+
+TEST_F(ReadvTest, ReadBuffersOverlapping_File) {
+ ReadBuffersOverlapping(test_file_fd_.get());
+}
+
+TEST_F(ReadvTest, ReadBuffersOverlapping_Pipe) {
+ ReadBuffersOverlapping(test_pipe_[0]);
+}
+
+TEST_F(ReadvTest, ReadBuffersDiscontinuous_File) {
+ ReadBuffersDiscontinuous(test_file_fd_.get());
+}
+
+TEST_F(ReadvTest, ReadBuffersDiscontinuous_Pipe) {
+ ReadBuffersDiscontinuous(test_pipe_[0]);
+}
+
+TEST_F(ReadvTest, ReadIovecsCompletelyFilled_File) {
+ ReadIovecsCompletelyFilled(test_file_fd_.get());
+}
+
+TEST_F(ReadvTest, ReadIovecsCompletelyFilled_Pipe) {
+ ReadIovecsCompletelyFilled(test_pipe_[0]);
+}
+
+TEST_F(ReadvTest, BadFileDescriptor) {
+ char buffer[1024];
+ struct iovec iov[1];
+ iov[0].iov_base = buffer;
+ iov[0].iov_len = 1024;
+
+ ASSERT_THAT(readv(-1, iov, 1024), SyscallFailsWithErrno(EBADF));
+}
+
+TEST_F(ReadvTest, BadIovecsPointer_File) {
+ ASSERT_THAT(readv(test_file_fd_.get(), nullptr, 1),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(ReadvTest, BadIovecsPointer_Pipe) {
+ ASSERT_THAT(readv(test_pipe_[0], nullptr, 1), SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(ReadvTest, BadIovecBase_File) {
+ struct iovec iov[1];
+ iov[0].iov_base = nullptr;
+ iov[0].iov_len = 1024;
+ ASSERT_THAT(readv(test_file_fd_.get(), iov, 1),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(ReadvTest, BadIovecBase_Pipe) {
+ struct iovec iov[1];
+ iov[0].iov_base = nullptr;
+ iov[0].iov_len = 1024;
+ ASSERT_THAT(readv(test_pipe_[0], iov, 1), SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(ReadvTest, ZeroIovecs_File) {
+ struct iovec iov[1];
+ iov[0].iov_base = 0;
+ iov[0].iov_len = 0;
+ ASSERT_THAT(readv(test_file_fd_.get(), iov, 1), SyscallSucceeds());
+}
+
+TEST_F(ReadvTest, ZeroIovecs_Pipe) {
+ struct iovec iov[1];
+ iov[0].iov_base = 0;
+ iov[0].iov_len = 0;
+ ASSERT_THAT(readv(test_pipe_[0], iov, 1), SyscallSucceeds());
+}
+
+TEST_F(ReadvTest, NotReadable_File) {
+ char buffer[1024];
+ struct iovec iov[1];
+ iov[0].iov_base = buffer;
+ iov[0].iov_len = 1024;
+
+ std::string wronly_file = NewTempAbsPath();
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(wronly_file, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR));
+ ASSERT_THAT(readv(fd.get(), iov, 1), SyscallFailsWithErrno(EBADF));
+ fd.reset(); // Close before unlinking.
+ ASSERT_THAT(unlink(wronly_file.c_str()), SyscallSucceeds());
+}
+
+TEST_F(ReadvTest, NotReadable_Pipe) {
+ char buffer[1024];
+ struct iovec iov[1];
+ iov[0].iov_base = buffer;
+ iov[0].iov_len = 1024;
+ ASSERT_THAT(readv(test_pipe_[1], iov, 1), SyscallFailsWithErrno(EBADF));
+}
+
+TEST_F(ReadvTest, DirNotReadable) {
+ char buffer[1024];
+ struct iovec iov[1];
+ iov[0].iov_base = buffer;
+ iov[0].iov_len = 1024;
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(GetAbsoluteTestTmpdir(), O_RDONLY));
+ ASSERT_THAT(readv(fd.get(), iov, 1), SyscallFailsWithErrno(EISDIR));
+}
+
+TEST_F(ReadvTest, OffsetIncremented) {
+ char* buffer = reinterpret_cast<char*>(malloc(kReadvTestDataSize));
+ struct iovec iov[1];
+ iov[0].iov_base = buffer;
+ iov[0].iov_len = kReadvTestDataSize;
+
+ ASSERT_THAT(readv(test_file_fd_.get(), iov, 1),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+ ASSERT_THAT(lseek(test_file_fd_.get(), 0, SEEK_CUR),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+
+ free(buffer);
+}
+
+TEST_F(ReadvTest, EndOfFile) {
+ char* buffer = reinterpret_cast<char*>(malloc(kReadvTestDataSize));
+ struct iovec iov[1];
+ iov[0].iov_base = buffer;
+ iov[0].iov_len = kReadvTestDataSize;
+ ASSERT_THAT(readv(test_file_fd_.get(), iov, 1),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+ free(buffer);
+
+ buffer = reinterpret_cast<char*>(malloc(kReadvTestDataSize));
+ iov[0].iov_base = buffer;
+ iov[0].iov_len = kReadvTestDataSize;
+ ASSERT_THAT(readv(test_file_fd_.get(), iov, 1), SyscallSucceedsWithValue(0));
+ free(buffer);
+}
+
+TEST_F(ReadvTest, WouldBlock_Pipe) {
+ struct iovec iov[1];
+ iov[0].iov_base = reinterpret_cast<char*>(malloc(kReadvTestDataSize));
+ iov[0].iov_len = kReadvTestDataSize;
+ ASSERT_THAT(readv(test_pipe_[0], iov, 1),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+ free(iov[0].iov_base);
+
+ iov[0].iov_base = reinterpret_cast<char*>(malloc(kReadvTestDataSize));
+ ASSERT_THAT(readv(test_pipe_[0], iov, 1), SyscallFailsWithErrno(EAGAIN));
+ free(iov[0].iov_base);
+}
+
+TEST_F(ReadvTest, ZeroBuffer) {
+ char buf[10];
+ struct iovec iov[1];
+ iov[0].iov_base = buf;
+ iov[0].iov_len = 0;
+ ASSERT_THAT(readv(test_pipe_[0], iov, 1), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(ReadvTest, NullIovecInNonemptyArray) {
+ std::vector<char> buf(kReadvTestDataSize);
+ struct iovec iov[2];
+ iov[0].iov_base = nullptr;
+ iov[0].iov_len = 0;
+ iov[1].iov_base = buf.data();
+ iov[1].iov_len = kReadvTestDataSize;
+ ASSERT_THAT(readv(test_file_fd_.get(), iov, 2),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+}
+
+TEST_F(ReadvTest, IovecOutsideTaskAddressRangeInNonemptyArray) {
+ std::vector<char> buf(kReadvTestDataSize);
+ struct iovec iov[2];
+ iov[0].iov_base = reinterpret_cast<void*>(~static_cast<uintptr_t>(0));
+ iov[0].iov_len = 0;
+ iov[1].iov_base = buf.data();
+ iov[1].iov_len = kReadvTestDataSize;
+ ASSERT_THAT(readv(test_file_fd_.get(), iov, 2),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+// This test depends on the maximum extent of a single readv() syscall, so
+// we can't tolerate interruption from saving.
+TEST(ReadvTestNoFixture, TruncatedAtMax_NoRandomSave) {
+ // Ensure that we won't be interrupted by ITIMER_PROF. This is particularly
+ // important in environments where automated profiling tools may start
+ // ITIMER_PROF automatically.
+ struct itimerval itv = {};
+ auto const cleanup_itimer =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_PROF, itv));
+
+ // From Linux's include/linux/fs.h.
+ size_t const MAX_RW_COUNT = INT_MAX & ~(kPageSize - 1);
+
+ // Create an iovec array with 3 segments pointing to consecutive parts of a
+ // buffer. The first covers all but the last three pages, and should be
+ // written to in its entirety. The second covers the last page before
+ // MAX_RW_COUNT and the first page after; only the first page should be
+ // written to. The third covers the last page of the buffer, and should be
+ // skipped entirely.
+ size_t const kBufferSize = MAX_RW_COUNT + 2 * kPageSize;
+ size_t const kFirstOffset = MAX_RW_COUNT - kPageSize;
+ size_t const kSecondOffset = MAX_RW_COUNT + kPageSize;
+ // The buffer is too big to fit on the stack.
+ std::vector<char> buf(kBufferSize);
+ struct iovec iov[3];
+ iov[0].iov_base = buf.data();
+ iov[0].iov_len = kFirstOffset;
+ iov[1].iov_base = buf.data() + kFirstOffset;
+ iov[1].iov_len = kSecondOffset - kFirstOffset;
+ iov[2].iov_base = buf.data() + kSecondOffset;
+ iov[2].iov_len = kBufferSize - kSecondOffset;
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/zero", O_RDONLY));
+ EXPECT_THAT(readv(fd.get(), iov, 3), SyscallSucceedsWithValue(MAX_RW_COUNT));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/readv_common.cc b/test/syscalls/linux/readv_common.cc
new file mode 100644
index 000000000..2694dc64f
--- /dev/null
+++ b/test/syscalls/linux/readv_common.cc
@@ -0,0 +1,220 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// MatchesStringLength checks that a tuple argument of (struct iovec *, int)
+// corresponding to an iovec array and its length, contains data that matches
+// the string length strlen.
+MATCHER_P(MatchesStringLength, strlen, "") {
+ struct iovec* iovs = arg.first;
+ int niov = arg.second;
+ int offset = 0;
+ for (int i = 0; i < niov; i++) {
+ offset += iovs[i].iov_len;
+ }
+ if (offset != static_cast<int>(strlen)) {
+ *result_listener << offset;
+ return false;
+ }
+ return true;
+}
+
+// MatchesStringValue checks that a tuple argument of (struct iovec *, int)
+// corresponding to an iovec array and its length, contains data that matches
+// the string value str.
+MATCHER_P(MatchesStringValue, str, "") {
+ struct iovec* iovs = arg.first;
+ int len = strlen(str);
+ int niov = arg.second;
+ int offset = 0;
+ for (int i = 0; i < niov; i++) {
+ struct iovec iov = iovs[i];
+ if (len < offset) {
+ *result_listener << "strlen " << len << " < offset " << offset;
+ return false;
+ }
+ if (strncmp(static_cast<char*>(iov.iov_base), &str[offset], iov.iov_len)) {
+ absl::string_view iovec_string(static_cast<char*>(iov.iov_base),
+ iov.iov_len);
+ *result_listener << iovec_string << " @offset " << offset;
+ return false;
+ }
+ offset += iov.iov_len;
+ }
+ return true;
+}
+
+extern const char kReadvTestData[] =
+ "127.0.0.1 localhost"
+ ""
+ "# The following lines are desirable for IPv6 capable hosts"
+ "::1 ip6-localhost ip6-loopback"
+ "fe00::0 ip6-localnet"
+ "ff00::0 ip6-mcastprefix"
+ "ff02::1 ip6-allnodes"
+ "ff02::2 ip6-allrouters"
+ "ff02::3 ip6-allhosts"
+ "192.168.1.100 a"
+ "93.184.216.34 foo.bar.example.com xcpu";
+extern const size_t kReadvTestDataSize = sizeof(kReadvTestData);
+
+static void ReadAllOneProvidedBuffer(int fd, std::vector<char>* buffer) {
+ struct iovec iovs[1];
+ iovs[0].iov_base = buffer->data();
+ iovs[0].iov_len = kReadvTestDataSize;
+
+ ASSERT_THAT(readv(fd, iovs, 1), SyscallSucceedsWithValue(kReadvTestDataSize));
+
+ std::pair<struct iovec*, int> iovec_desc(iovs, 1);
+ EXPECT_THAT(iovec_desc, MatchesStringLength(kReadvTestDataSize));
+ EXPECT_THAT(iovec_desc, MatchesStringValue(kReadvTestData));
+}
+
+void ReadAllOneBuffer(int fd) {
+ std::vector<char> buffer(kReadvTestDataSize);
+ ReadAllOneProvidedBuffer(fd, &buffer);
+}
+
+void ReadAllOneLargeBuffer(int fd) {
+ std::vector<char> buffer(10 * kReadvTestDataSize);
+ ReadAllOneProvidedBuffer(fd, &buffer);
+}
+
+void ReadOneHalfAtATime(int fd) {
+ int len0 = kReadvTestDataSize / 2;
+ int len1 = kReadvTestDataSize - len0;
+ std::vector<char> buffer0(len0);
+ std::vector<char> buffer1(len1);
+
+ struct iovec iovs[2];
+ iovs[0].iov_base = buffer0.data();
+ iovs[0].iov_len = len0;
+ iovs[1].iov_base = buffer1.data();
+ iovs[1].iov_len = len1;
+
+ ASSERT_THAT(readv(fd, iovs, 2), SyscallSucceedsWithValue(kReadvTestDataSize));
+
+ std::pair<struct iovec*, int> iovec_desc(iovs, 2);
+ EXPECT_THAT(iovec_desc, MatchesStringLength(kReadvTestDataSize));
+ EXPECT_THAT(iovec_desc, MatchesStringValue(kReadvTestData));
+}
+
+void ReadOneBufferPerByte(int fd) {
+ std::vector<char> buffer(kReadvTestDataSize);
+ std::vector<struct iovec> iovs(kReadvTestDataSize);
+ char* buffer_ptr = buffer.data();
+ struct iovec* iovs_ptr = iovs.data();
+
+ for (int i = 0; i < static_cast<int>(kReadvTestDataSize); i++) {
+ struct iovec iov = {
+ .iov_base = &buffer_ptr[i],
+ .iov_len = 1,
+ };
+ iovs_ptr[i] = iov;
+ }
+
+ ASSERT_THAT(readv(fd, iovs_ptr, kReadvTestDataSize),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+
+ std::pair<struct iovec*, int> iovec_desc(iovs.data(), kReadvTestDataSize);
+ EXPECT_THAT(iovec_desc, MatchesStringLength(kReadvTestDataSize));
+ EXPECT_THAT(iovec_desc, MatchesStringValue(kReadvTestData));
+}
+
+void ReadBuffersOverlapping(int fd) {
+ // overlap the first overlap_bytes.
+ int overlap_bytes = 8;
+ std::vector<char> buffer(kReadvTestDataSize);
+
+ // overlapping causes us to get more data.
+ int expected_size = kReadvTestDataSize + overlap_bytes;
+ std::vector<char> expected(expected_size);
+ char* expected_ptr = expected.data();
+ memcpy(expected_ptr, &kReadvTestData[overlap_bytes], overlap_bytes);
+ memcpy(&expected_ptr[overlap_bytes], &kReadvTestData[overlap_bytes],
+ kReadvTestDataSize - overlap_bytes);
+
+ struct iovec iovs[2];
+ iovs[0].iov_base = buffer.data();
+ iovs[0].iov_len = overlap_bytes;
+ iovs[1].iov_base = buffer.data();
+ iovs[1].iov_len = kReadvTestDataSize;
+
+ ASSERT_THAT(readv(fd, iovs, 2), SyscallSucceedsWithValue(kReadvTestDataSize));
+
+ std::pair<struct iovec*, int> iovec_desc(iovs, 2);
+ EXPECT_THAT(iovec_desc, MatchesStringLength(expected_size));
+ EXPECT_THAT(iovec_desc, MatchesStringValue(expected_ptr));
+}
+
+void ReadBuffersDiscontinuous(int fd) {
+ // Each iov is 1 byte separated by 1 byte.
+ std::vector<char> buffer(kReadvTestDataSize * 2);
+ std::vector<struct iovec> iovs(kReadvTestDataSize);
+
+ char* buffer_ptr = buffer.data();
+ struct iovec* iovs_ptr = iovs.data();
+
+ for (int i = 0; i < static_cast<int>(kReadvTestDataSize); i++) {
+ struct iovec iov = {
+ .iov_base = &buffer_ptr[i * 2],
+ .iov_len = 1,
+ };
+ iovs_ptr[i] = iov;
+ }
+
+ ASSERT_THAT(readv(fd, iovs_ptr, kReadvTestDataSize),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+
+ std::pair<struct iovec*, int> iovec_desc(iovs.data(), kReadvTestDataSize);
+ EXPECT_THAT(iovec_desc, MatchesStringLength(kReadvTestDataSize));
+ EXPECT_THAT(iovec_desc, MatchesStringValue(kReadvTestData));
+}
+
+void ReadIovecsCompletelyFilled(int fd) {
+ int half = kReadvTestDataSize / 2;
+ std::vector<char> buffer(kReadvTestDataSize);
+ char* buffer_ptr = buffer.data();
+ memset(buffer.data(), '\0', kReadvTestDataSize);
+
+ struct iovec iovs[2];
+ iovs[0].iov_base = buffer.data();
+ iovs[0].iov_len = half;
+ iovs[1].iov_base = &buffer_ptr[half];
+ iovs[1].iov_len = half;
+
+ ASSERT_THAT(readv(fd, iovs, 2), SyscallSucceedsWithValue(half * 2));
+
+ std::pair<struct iovec*, int> iovec_desc(iovs, 2);
+ EXPECT_THAT(iovec_desc, MatchesStringLength(half * 2));
+ EXPECT_THAT(iovec_desc, MatchesStringValue(kReadvTestData));
+
+ char* str = static_cast<char*>(iovs[0].iov_base);
+ str[iovs[0].iov_len - 1] = '\0';
+ ASSERT_EQ(half - 1, strlen(str));
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/readv_common.h b/test/syscalls/linux/readv_common.h
new file mode 100644
index 000000000..2fa40c35f
--- /dev/null
+++ b/test/syscalls/linux/readv_common.h
@@ -0,0 +1,61 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_READV_COMMON_H_
+#define GVISOR_TEST_SYSCALLS_READV_COMMON_H_
+
+#include <stddef.h>
+
+namespace gvisor {
+namespace testing {
+
+// A NUL-terminated string containing the data used by tests using the following
+// test helpers.
+extern const char kReadvTestData[];
+
+// The size of kReadvTestData, including the terminating NUL.
+extern const size_t kReadvTestDataSize;
+
+// ReadAllOneBuffer asserts that it can read kReadvTestData from an fd using
+// exactly one iovec.
+void ReadAllOneBuffer(int fd);
+
+// ReadAllOneLargeBuffer asserts that it can read kReadvTestData from an fd
+// using exactly one iovec containing an overly large buffer.
+void ReadAllOneLargeBuffer(int fd);
+
+// ReadOneHalfAtATime asserts that it can read test_data_from an fd using
+// exactly two iovecs that are roughly equivalent in size.
+void ReadOneHalfAtATime(int fd);
+
+// ReadOneBufferPerByte asserts that it can read kReadvTestData from an fd
+// using one iovec per byte.
+void ReadOneBufferPerByte(int fd);
+
+// ReadBuffersOverlapping asserts that it can read kReadvTestData from an fd
+// where two iovecs are overlapping.
+void ReadBuffersOverlapping(int fd);
+
+// ReadBuffersDiscontinuous asserts that it can read kReadvTestData from an fd
+// where each iovec is discontinuous from the next by 1 byte.
+void ReadBuffersDiscontinuous(int fd);
+
+// ReadIovecsCompletelyFilled asserts that the previous iovec is completely
+// filled before moving onto the next.
+void ReadIovecsCompletelyFilled(int fd);
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_READV_COMMON_H_
diff --git a/test/syscalls/linux/readv_socket.cc b/test/syscalls/linux/readv_socket.cc
new file mode 100644
index 000000000..dd6fb7008
--- /dev/null
+++ b/test/syscalls/linux/readv_socket.cc
@@ -0,0 +1,212 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/readv_common.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class ReadvSocketTest : public ::testing::Test {
+ public:
+ void SetUp() override {
+ test_unix_stream_socket_[0] = -1;
+ test_unix_stream_socket_[1] = -1;
+ test_unix_dgram_socket_[0] = -1;
+ test_unix_dgram_socket_[1] = -1;
+ test_unix_seqpacket_socket_[0] = -1;
+ test_unix_seqpacket_socket_[1] = -1;
+
+ ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, 0, test_unix_stream_socket_),
+ SyscallSucceeds());
+ ASSERT_THAT(fcntl(test_unix_stream_socket_[0], F_SETFL, O_NONBLOCK),
+ SyscallSucceeds());
+ ASSERT_THAT(socketpair(AF_UNIX, SOCK_DGRAM, 0, test_unix_dgram_socket_),
+ SyscallSucceeds());
+ ASSERT_THAT(fcntl(test_unix_dgram_socket_[0], F_SETFL, O_NONBLOCK),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ socketpair(AF_UNIX, SOCK_SEQPACKET, 0, test_unix_seqpacket_socket_),
+ SyscallSucceeds());
+ ASSERT_THAT(fcntl(test_unix_seqpacket_socket_[0], F_SETFL, O_NONBLOCK),
+ SyscallSucceeds());
+
+ ASSERT_THAT(
+ write(test_unix_stream_socket_[1], kReadvTestData, kReadvTestDataSize),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+ ASSERT_THAT(
+ write(test_unix_dgram_socket_[1], kReadvTestData, kReadvTestDataSize),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+ ASSERT_THAT(write(test_unix_seqpacket_socket_[1], kReadvTestData,
+ kReadvTestDataSize),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+ }
+
+ void TearDown() override {
+ close(test_unix_stream_socket_[0]);
+ close(test_unix_stream_socket_[1]);
+
+ close(test_unix_dgram_socket_[0]);
+ close(test_unix_dgram_socket_[1]);
+
+ close(test_unix_seqpacket_socket_[0]);
+ close(test_unix_seqpacket_socket_[1]);
+ }
+
+ int test_unix_stream_socket_[2];
+ int test_unix_dgram_socket_[2];
+ int test_unix_seqpacket_socket_[2];
+};
+
+TEST_F(ReadvSocketTest, ReadOneBufferPerByte_StreamSocket) {
+ ReadOneBufferPerByte(test_unix_stream_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadOneBufferPerByte_DgramSocket) {
+ ReadOneBufferPerByte(test_unix_dgram_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadOneBufferPerByte_SeqPacketSocket) {
+ ReadOneBufferPerByte(test_unix_seqpacket_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadOneHalfAtATime_StreamSocket) {
+ ReadOneHalfAtATime(test_unix_stream_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadOneHalfAtATime_DgramSocket) {
+ ReadOneHalfAtATime(test_unix_dgram_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadAllOneBuffer_StreamSocket) {
+ ReadAllOneBuffer(test_unix_stream_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadAllOneBuffer_DgramSocket) {
+ ReadAllOneBuffer(test_unix_dgram_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadAllOneLargeBuffer_StreamSocket) {
+ ReadAllOneLargeBuffer(test_unix_stream_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadAllOneLargeBuffer_DgramSocket) {
+ ReadAllOneLargeBuffer(test_unix_dgram_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadBuffersOverlapping_StreamSocket) {
+ ReadBuffersOverlapping(test_unix_stream_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadBuffersOverlapping_DgramSocket) {
+ ReadBuffersOverlapping(test_unix_dgram_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadBuffersDiscontinuous_StreamSocket) {
+ ReadBuffersDiscontinuous(test_unix_stream_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadBuffersDiscontinuous_DgramSocket) {
+ ReadBuffersDiscontinuous(test_unix_dgram_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadIovecsCompletelyFilled_StreamSocket) {
+ ReadIovecsCompletelyFilled(test_unix_stream_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, ReadIovecsCompletelyFilled_DgramSocket) {
+ ReadIovecsCompletelyFilled(test_unix_dgram_socket_[0]);
+}
+
+TEST_F(ReadvSocketTest, BadIovecsPointer_StreamSocket) {
+ ASSERT_THAT(readv(test_unix_stream_socket_[0], nullptr, 1),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(ReadvSocketTest, BadIovecsPointer_DgramSocket) {
+ ASSERT_THAT(readv(test_unix_dgram_socket_[0], nullptr, 1),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(ReadvSocketTest, BadIovecBase_StreamSocket) {
+ struct iovec iov[1];
+ iov[0].iov_base = nullptr;
+ iov[0].iov_len = 1024;
+ ASSERT_THAT(readv(test_unix_stream_socket_[0], iov, 1),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(ReadvSocketTest, BadIovecBase_DgramSocket) {
+ struct iovec iov[1];
+ iov[0].iov_base = nullptr;
+ iov[0].iov_len = 1024;
+ ASSERT_THAT(readv(test_unix_dgram_socket_[0], iov, 1),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(ReadvSocketTest, ZeroIovecs_StreamSocket) {
+ struct iovec iov[1];
+ iov[0].iov_base = 0;
+ iov[0].iov_len = 0;
+ ASSERT_THAT(readv(test_unix_stream_socket_[0], iov, 1), SyscallSucceeds());
+}
+
+TEST_F(ReadvSocketTest, ZeroIovecs_DgramSocket) {
+ struct iovec iov[1];
+ iov[0].iov_base = 0;
+ iov[0].iov_len = 0;
+ ASSERT_THAT(readv(test_unix_dgram_socket_[0], iov, 1), SyscallSucceeds());
+}
+
+TEST_F(ReadvSocketTest, WouldBlock_StreamSocket) {
+ struct iovec iov[1];
+ iov[0].iov_base = reinterpret_cast<char*>(malloc(kReadvTestDataSize));
+ iov[0].iov_len = kReadvTestDataSize;
+ ASSERT_THAT(readv(test_unix_stream_socket_[0], iov, 1),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+ free(iov[0].iov_base);
+
+ iov[0].iov_base = reinterpret_cast<char*>(malloc(kReadvTestDataSize));
+ ASSERT_THAT(readv(test_unix_stream_socket_[0], iov, 1),
+ SyscallFailsWithErrno(EAGAIN));
+ free(iov[0].iov_base);
+}
+
+TEST_F(ReadvSocketTest, WouldBlock_DgramSocket) {
+ struct iovec iov[1];
+ iov[0].iov_base = reinterpret_cast<char*>(malloc(kReadvTestDataSize));
+ iov[0].iov_len = kReadvTestDataSize;
+ ASSERT_THAT(readv(test_unix_dgram_socket_[0], iov, 1),
+ SyscallSucceedsWithValue(kReadvTestDataSize));
+ free(iov[0].iov_base);
+
+ iov[0].iov_base = reinterpret_cast<char*>(malloc(kReadvTestDataSize));
+ ASSERT_THAT(readv(test_unix_dgram_socket_[0], iov, 1),
+ SyscallFailsWithErrno(EAGAIN));
+ free(iov[0].iov_base);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/rename.cc b/test/syscalls/linux/rename.cc
new file mode 100644
index 000000000..833c0dc4f
--- /dev/null
+++ b/test/syscalls/linux/rename.cc
@@ -0,0 +1,394 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdio.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(RenameTest, RootToAnything) {
+ ASSERT_THAT(rename("/", "/bin"), SyscallFailsWithErrno(EBUSY));
+}
+
+TEST(RenameTest, AnythingToRoot) {
+ ASSERT_THAT(rename("/bin", "/"), SyscallFailsWithErrno(EBUSY));
+}
+
+TEST(RenameTest, SourceIsAncestorOfTarget) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto subdir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path()));
+ ASSERT_THAT(rename(dir.path().c_str(), subdir.path().c_str()),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Try an even deeper directory.
+ auto deep_subdir =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(subdir.path()));
+ ASSERT_THAT(rename(dir.path().c_str(), deep_subdir.path().c_str()),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(RenameTest, TargetIsAncestorOfSource) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto subdir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path()));
+ ASSERT_THAT(rename(subdir.path().c_str(), dir.path().c_str()),
+ SyscallFailsWithErrno(ENOTEMPTY));
+
+ // Try an even deeper directory.
+ auto deep_subdir =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(subdir.path()));
+ ASSERT_THAT(rename(deep_subdir.path().c_str(), dir.path().c_str()),
+ SyscallFailsWithErrno(ENOTEMPTY));
+}
+
+TEST(RenameTest, FileToSelf) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ EXPECT_THAT(rename(f.path().c_str(), f.path().c_str()), SyscallSucceeds());
+}
+
+TEST(RenameTest, DirectoryToSelf) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(rename(f.path().c_str(), f.path().c_str()), SyscallSucceeds());
+}
+
+TEST(RenameTest, FileToSameDirectory) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ std::string const newpath = NewTempAbsPath();
+ ASSERT_THAT(rename(f.path().c_str(), newpath.c_str()), SyscallSucceeds());
+ std::string const oldpath = f.release();
+ f.reset(newpath);
+ EXPECT_THAT(Exists(oldpath), IsPosixErrorOkAndHolds(false));
+ EXPECT_THAT(Exists(newpath), IsPosixErrorOkAndHolds(true));
+}
+
+TEST(RenameTest, DirectoryToSameDirectory) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ std::string const newpath = NewTempAbsPath();
+ ASSERT_THAT(rename(dir.path().c_str(), newpath.c_str()), SyscallSucceeds());
+ std::string const oldpath = dir.release();
+ dir.reset(newpath);
+ EXPECT_THAT(Exists(oldpath), IsPosixErrorOkAndHolds(false));
+ EXPECT_THAT(Exists(newpath), IsPosixErrorOkAndHolds(true));
+}
+
+TEST(RenameTest, FileToParentDirectory) {
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir1.path()));
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir2.path()));
+ std::string const newpath = NewTempAbsPathInDir(dir1.path());
+ ASSERT_THAT(rename(f.path().c_str(), newpath.c_str()), SyscallSucceeds());
+ std::string const oldpath = f.release();
+ f.reset(newpath);
+ EXPECT_THAT(Exists(oldpath), IsPosixErrorOkAndHolds(false));
+ EXPECT_THAT(Exists(newpath), IsPosixErrorOkAndHolds(true));
+}
+
+TEST(RenameTest, DirectoryToParentDirectory) {
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir1.path()));
+ auto dir3 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir2.path()));
+ EXPECT_THAT(IsDirectory(dir3.path()), IsPosixErrorOkAndHolds(true));
+ std::string const newpath = NewTempAbsPathInDir(dir1.path());
+ ASSERT_THAT(rename(dir3.path().c_str(), newpath.c_str()), SyscallSucceeds());
+ std::string const oldpath = dir3.release();
+ dir3.reset(newpath);
+ EXPECT_THAT(Exists(oldpath), IsPosixErrorOkAndHolds(false));
+ EXPECT_THAT(Exists(newpath), IsPosixErrorOkAndHolds(true));
+ EXPECT_THAT(IsDirectory(newpath), IsPosixErrorOkAndHolds(true));
+}
+
+TEST(RenameTest, FileToChildDirectory) {
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir1.path()));
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
+ std::string const newpath = NewTempAbsPathInDir(dir2.path());
+ ASSERT_THAT(rename(f.path().c_str(), newpath.c_str()), SyscallSucceeds());
+ std::string const oldpath = f.release();
+ f.reset(newpath);
+ EXPECT_THAT(Exists(oldpath), IsPosixErrorOkAndHolds(false));
+ EXPECT_THAT(Exists(newpath), IsPosixErrorOkAndHolds(true));
+}
+
+TEST(RenameTest, DirectoryToChildDirectory) {
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir1.path()));
+ auto dir3 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir1.path()));
+ std::string const newpath = NewTempAbsPathInDir(dir2.path());
+ ASSERT_THAT(rename(dir3.path().c_str(), newpath.c_str()), SyscallSucceeds());
+ std::string const oldpath = dir3.release();
+ dir3.reset(newpath);
+ EXPECT_THAT(Exists(oldpath), IsPosixErrorOkAndHolds(false));
+ EXPECT_THAT(Exists(newpath), IsPosixErrorOkAndHolds(true));
+ EXPECT_THAT(IsDirectory(newpath), IsPosixErrorOkAndHolds(true));
+}
+
+TEST(RenameTest, DirectoryToOwnChildDirectory) {
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir1.path()));
+ std::string const newpath = NewTempAbsPathInDir(dir2.path());
+ ASSERT_THAT(rename(dir1.path().c_str(), newpath.c_str()),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(RenameTest, FileOverwritesFile) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto f1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ dir.path(), "first", TempPath::kDefaultFileMode));
+ auto f2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ dir.path(), "second", TempPath::kDefaultFileMode));
+ ASSERT_THAT(rename(f1.path().c_str(), f2.path().c_str()), SyscallSucceeds());
+ EXPECT_THAT(Exists(f1.path()), IsPosixErrorOkAndHolds(false));
+
+ f1.release();
+ std::string f2_contents;
+ ASSERT_NO_ERRNO(GetContents(f2.path(), &f2_contents));
+ EXPECT_EQ("first", f2_contents);
+}
+
+TEST(RenameTest, DirectoryOverwritesDirectoryLinkCount) {
+ auto parent1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(Links(parent1.path()), IsPosixErrorOkAndHolds(2));
+
+ auto parent2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(Links(parent2.path()), IsPosixErrorOkAndHolds(2));
+
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent1.path()));
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent2.path()));
+
+ EXPECT_THAT(Links(parent1.path()), IsPosixErrorOkAndHolds(3));
+ EXPECT_THAT(Links(parent2.path()), IsPosixErrorOkAndHolds(3));
+
+ ASSERT_THAT(rename(dir1.path().c_str(), dir2.path().c_str()),
+ SyscallSucceeds());
+
+ EXPECT_THAT(Links(parent1.path()), IsPosixErrorOkAndHolds(2));
+ EXPECT_THAT(Links(parent2.path()), IsPosixErrorOkAndHolds(3));
+}
+
+TEST(RenameTest, FileDoesNotExist) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const std::string source = JoinPath(dir.path(), "source");
+ const std::string dest = JoinPath(dir.path(), "dest");
+ ASSERT_THAT(rename(source.c_str(), dest.c_str()),
+ SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(RenameTest, FileDoesNotOverwriteDirectory) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ ASSERT_THAT(rename(f.path().c_str(), dir.path().c_str()),
+ SyscallFailsWithErrno(EISDIR));
+}
+
+TEST(RenameTest, DirectoryDoesNotOverwriteFile) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ ASSERT_THAT(rename(dir.path().c_str(), f.path().c_str()),
+ SyscallFailsWithErrno(ENOTDIR));
+}
+
+TEST(RenameTest, DirectoryOverwritesEmptyDirectory) {
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(rename(dir1.path().c_str(), dir2.path().c_str()),
+ SyscallSucceeds());
+ EXPECT_THAT(Exists(dir1.path()), IsPosixErrorOkAndHolds(false));
+ dir1.release();
+ EXPECT_THAT(Exists(JoinPath(dir2.path(), Basename(f.path()))),
+ IsPosixErrorOkAndHolds(true));
+ f.release();
+}
+
+TEST(RenameTest, FailsWithDots) {
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto dir1_dot = absl::StrCat(dir1.path(), "/.");
+ auto dir2_dot = absl::StrCat(dir2.path(), "/.");
+ auto dir1_dot_dot = absl::StrCat(dir1.path(), "/..");
+ auto dir2_dot_dot = absl::StrCat(dir2.path(), "/..");
+
+ // Try with dot paths in the first argument
+ EXPECT_THAT(rename(dir1_dot.c_str(), dir2.path().c_str()),
+ SyscallFailsWithErrno(EBUSY));
+ EXPECT_THAT(rename(dir1_dot_dot.c_str(), dir2.path().c_str()),
+ SyscallFailsWithErrno(EBUSY));
+
+ // Try with dot paths in the second argument
+ EXPECT_THAT(rename(dir1.path().c_str(), dir2_dot.c_str()),
+ SyscallFailsWithErrno(EBUSY));
+ EXPECT_THAT(rename(dir1.path().c_str(), dir2_dot_dot.c_str()),
+ SyscallFailsWithErrno(EBUSY));
+}
+
+TEST(RenameTest, DirectoryDoesNotOverwriteNonemptyDirectory) {
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto f1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto f2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir2.path()));
+ ASSERT_THAT(rename(dir1.path().c_str(), dir2.path().c_str()),
+ SyscallFailsWithErrno(ENOTEMPTY));
+}
+
+TEST(RenameTest, FailsWhenOldParentNotWritable) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto f1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ // dir1 is not writable.
+ ASSERT_THAT(chmod(dir1.path().c_str(), 0555), SyscallSucceeds());
+
+ std::string const newpath = NewTempAbsPathInDir(dir2.path());
+ EXPECT_THAT(rename(f1.path().c_str(), newpath.c_str()),
+ SyscallFailsWithErrno(EACCES));
+}
+
+TEST(RenameTest, FailsWhenNewParentNotWritable) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto f1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
+ // dir2 is not writable.
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0555));
+
+ std::string const newpath = NewTempAbsPathInDir(dir2.path());
+ EXPECT_THAT(rename(f1.path().c_str(), newpath.c_str()),
+ SyscallFailsWithErrno(EACCES));
+}
+
+// Equivalent to FailsWhenNewParentNotWritable, but with a destination file
+// to overwrite.
+TEST(RenameTest, OverwriteFailsWhenNewParentNotWritable) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto f1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
+
+ // dir2 is not writable.
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto f2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir2.path()));
+ ASSERT_THAT(chmod(dir2.path().c_str(), 0555), SyscallSucceeds());
+
+ EXPECT_THAT(rename(f1.path().c_str(), f2.path().c_str()),
+ SyscallFailsWithErrno(EACCES));
+}
+
+// If the parent directory of source is not accessible, rename returns EACCES
+// because the user cannot determine if source exists.
+TEST(RenameTest, FileDoesNotExistWhenNewParentNotExecutable) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ // No execute permission.
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateDirWith(GetAbsoluteTestTmpdir(), 0400));
+
+ const std::string source = JoinPath(dir.path(), "source");
+ const std::string dest = JoinPath(dir.path(), "dest");
+ ASSERT_THAT(rename(source.c_str(), dest.c_str()),
+ SyscallFailsWithErrno(EACCES));
+}
+
+TEST(RenameTest, DirectoryWithOpenFdOverwritesEmptyDirectory) {
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path()));
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ // Get an fd on dir1
+ int fd;
+ ASSERT_THAT(fd = open(dir1.path().c_str(), O_DIRECTORY), SyscallSucceeds());
+ auto close_f = Cleanup([fd] {
+ // Close the fd on f.
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(rename(dir1.path().c_str(), dir2.path().c_str()),
+ SyscallSucceeds());
+
+ const std::string new_f_path = JoinPath(dir2.path(), Basename(f.path()));
+
+ auto remove_f = Cleanup([&] {
+ // Delete f in its new location.
+ ASSERT_NO_ERRNO(Delete(new_f_path));
+ f.release();
+ });
+
+ EXPECT_THAT(Exists(dir1.path()), IsPosixErrorOkAndHolds(false));
+ dir1.release();
+ EXPECT_THAT(Exists(new_f_path), IsPosixErrorOkAndHolds(true));
+}
+
+TEST(RenameTest, FileWithOpenFd) {
+ TempPath root_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath dir1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root_dir.path()));
+ TempPath dir2 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root_dir.path()));
+ TempPath dir3 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root_dir.path()));
+
+ // Create file in dir1.
+ constexpr char kContents[] = "foo";
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ dir1.path(), kContents, TempPath::kDefaultFileMode));
+
+ // Get fd on file.
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR));
+
+ // Move f to dir2.
+ const std::string path2 = NewTempAbsPathInDir(dir2.path());
+ ASSERT_THAT(rename(f.path().c_str(), path2.c_str()), SyscallSucceeds());
+
+ // Read f's kContents.
+ char buf[sizeof(kContents)];
+ EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(kContents), 0),
+ SyscallSucceedsWithValue(sizeof(kContents) - 1));
+ EXPECT_EQ(absl::string_view(buf, sizeof(buf) - 1), kContents);
+
+ // Move f to dir3.
+ const std::string path3 = NewTempAbsPathInDir(dir3.path());
+ ASSERT_THAT(rename(path2.c_str(), path3.c_str()), SyscallSucceeds());
+
+ // Read f's kContents.
+ EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(kContents), 0),
+ SyscallSucceedsWithValue(sizeof(kContents) - 1));
+ EXPECT_EQ(absl::string_view(buf, sizeof(buf) - 1), kContents);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/rlimits.cc b/test/syscalls/linux/rlimits.cc
new file mode 100644
index 000000000..860f0f688
--- /dev/null
+++ b/test/syscalls/linux/rlimits.cc
@@ -0,0 +1,75 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/resource.h>
+#include <sys/time.h>
+
+#include "test/util/capability_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(RlimitTest, SetRlimitHigher) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_RESOURCE)));
+
+ struct rlimit rl = {};
+ EXPECT_THAT(getrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds());
+
+ // Lower the rlimit first, as it may be equal to /proc/sys/fs/nr_open, in
+ // which case even users with CAP_SYS_RESOURCE can't raise it.
+ rl.rlim_cur--;
+ rl.rlim_max--;
+ ASSERT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds());
+
+ rl.rlim_max++;
+ EXPECT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds());
+}
+
+TEST(RlimitTest, UnprivilegedSetRlimit) {
+ // Drop privileges if necessary.
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_RESOURCE))) {
+ EXPECT_NO_ERRNO(SetCapability(CAP_SYS_RESOURCE, false));
+ }
+
+ struct rlimit rl = {};
+ rl.rlim_cur = 1000;
+ rl.rlim_max = 20000;
+ EXPECT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds());
+
+ struct rlimit rl2 = {};
+ EXPECT_THAT(getrlimit(RLIMIT_NOFILE, &rl2), SyscallSucceeds());
+ EXPECT_EQ(rl.rlim_cur, rl2.rlim_cur);
+ EXPECT_EQ(rl.rlim_max, rl2.rlim_max);
+
+ rl.rlim_max = 100000;
+ EXPECT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallFailsWithErrno(EPERM));
+}
+
+TEST(RlimitTest, SetSoftRlimitAboveHard) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_RESOURCE)));
+
+ struct rlimit rl = {};
+ EXPECT_THAT(getrlimit(RLIMIT_NOFILE, &rl), SyscallSucceeds());
+
+ rl.rlim_cur = rl.rlim_max + 1;
+ EXPECT_THAT(setrlimit(RLIMIT_NOFILE, &rl), SyscallFailsWithErrno(EINVAL));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/rseq.cc b/test/syscalls/linux/rseq.cc
new file mode 100644
index 000000000..4bfb1ff56
--- /dev/null
+++ b/test/syscalls/linux/rseq.cc
@@ -0,0 +1,198 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/rseq/test.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+#include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Syscall test for rseq (restartable sequences).
+//
+// We must be very careful about how these tests are written. Each thread may
+// only have one struct rseq registration, which may be done automatically at
+// thread start (as of 2019-11-13, glibc does *not* support rseq and thus does
+// not do so, but other libraries do).
+//
+// Testing of rseq is thus done primarily in a child process with no
+// registration. This means exec'ing a nostdlib binary, as rseq registration can
+// only be cleared by execve (or knowing the old rseq address), and glibc (based
+// on the current unmerged patches) register rseq before calling main()).
+
+int RSeq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
+ return syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
+}
+
+// Returns true if this kernel supports the rseq syscall.
+PosixErrorOr<bool> RSeqSupported() {
+ // We have to be careful here, there are three possible cases:
+ //
+ // 1. rseq is not supported -> ENOSYS
+ // 2. rseq is supported and not registered -> success, but we should
+ // unregister.
+ // 3. rseq is supported and registered -> EINVAL (most likely).
+
+ // The only validation done on new registrations is that rseq is aligned and
+ // writable.
+ rseq rseq = {};
+ int ret = RSeq(&rseq, sizeof(rseq), 0, 0);
+ if (ret == 0) {
+ // Successfully registered, rseq is supported. Unregister.
+ ret = RSeq(&rseq, sizeof(rseq), kRseqFlagUnregister, 0);
+ if (ret != 0) {
+ return PosixError(errno);
+ }
+ return true;
+ }
+
+ switch (errno) {
+ case ENOSYS:
+ // Not supported.
+ return false;
+ case EINVAL:
+ // Supported, but already registered. EINVAL returned because we provided
+ // a different address.
+ return true;
+ default:
+ // Unknown error.
+ return PosixError(errno);
+ }
+}
+
+constexpr char kRseqBinary[] = "test/syscalls/linux/rseq/rseq";
+
+void RunChildTest(std::string test_case, int want_status) {
+ std::string path = RunfilePath(kRseqBinary);
+
+ pid_t child_pid = -1;
+ int execve_errno = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(path, {path, test_case}, {}, &child_pid, &execve_errno));
+
+ ASSERT_GT(child_pid, 0);
+ ASSERT_EQ(execve_errno, 0);
+
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ ASSERT_EQ(status, want_status);
+}
+
+// Test that rseq must be aligned.
+TEST(RseqTest, Unaligned) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestUnaligned, 0);
+}
+
+// Sanity test that registration works.
+TEST(RseqTest, Register) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestRegister, 0);
+}
+
+// Registration can't be done twice.
+TEST(RseqTest, DoubleRegister) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestDoubleRegister, 0);
+}
+
+// Registration can be done again after unregister.
+TEST(RseqTest, RegisterUnregister) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestRegisterUnregister, 0);
+}
+
+// The pointer to rseq must match on register/unregister.
+TEST(RseqTest, UnregisterDifferentPtr) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestUnregisterDifferentPtr, 0);
+}
+
+// The signature must match on register/unregister.
+TEST(RseqTest, UnregisterDifferentSignature) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestUnregisterDifferentSignature, 0);
+}
+
+// The CPU ID is initialized.
+TEST(RseqTest, CPU) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestCPU, 0);
+}
+
+// Critical section is eventually aborted.
+TEST(RseqTest, Abort) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestAbort, 0);
+}
+
+// Abort may be before the critical section.
+TEST(RseqTest, AbortBefore) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestAbortBefore, 0);
+}
+
+// Signature must match.
+TEST(RseqTest, AbortSignature) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestAbortSignature, SIGSEGV);
+}
+
+// Abort must not be in the critical section.
+TEST(RseqTest, AbortPreCommit) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestAbortPreCommit, SIGSEGV);
+}
+
+// rseq.rseq_cs is cleared on abort.
+TEST(RseqTest, AbortClearsCS) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestAbortClearsCS, 0);
+}
+
+// rseq.rseq_cs is cleared on abort outside of critical section.
+TEST(RseqTest, InvalidAbortClearsCS) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(RSeqSupported()));
+
+ RunChildTest(kRseqTestInvalidAbortClearsCS, 0);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD
new file mode 100644
index 000000000..853258b04
--- /dev/null
+++ b/test/syscalls/linux/rseq/BUILD
@@ -0,0 +1,61 @@
+# This package contains a standalone rseq test binary. This binary must not
+# depend on libc, which might use rseq itself.
+
+load("//tools:defs.bzl", "cc_flags_supplier", "cc_library", "cc_toolchain", "select_arch")
+
+package(licenses = ["notice"])
+
+genrule(
+ name = "rseq_binary",
+ srcs = [
+ "critical.h",
+ "critical_amd64.S",
+ "critical_arm64.S",
+ "rseq.cc",
+ "syscalls.h",
+ "start_amd64.S",
+ "start_arm64.S",
+ "test.h",
+ "types.h",
+ "uapi.h",
+ ],
+ outs = ["rseq"],
+ cmd = "$(CC) " +
+ "$(CC_FLAGS) " +
+ "-I. " +
+ "-Wall " +
+ "-Werror " +
+ "-O2 " +
+ "-std=c++17 " +
+ "-static " +
+ "-nostdlib " +
+ "-ffreestanding " +
+ "-o " +
+ "$(location rseq) " +
+ select_arch(
+ amd64 = "$(location critical_amd64.S) $(location start_amd64.S) ",
+ arm64 = "$(location critical_arm64.S) $(location start_arm64.S) ",
+ no_match_error = "unsupported architecture",
+ ) +
+ "$(location rseq.cc)",
+ toolchains = [
+ cc_toolchain,
+ ":no_pie_cc_flags",
+ ],
+ visibility = ["//:sandbox"],
+)
+
+cc_flags_supplier(
+ name = "no_pie_cc_flags",
+ features = ["-pie"],
+)
+
+cc_library(
+ name = "lib",
+ testonly = 1,
+ hdrs = [
+ "test.h",
+ "uapi.h",
+ ],
+ visibility = ["//:sandbox"],
+)
diff --git a/test/syscalls/linux/rseq/critical.h b/test/syscalls/linux/rseq/critical.h
new file mode 100644
index 000000000..ac987a25e
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical.h
@@ -0,0 +1,39 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
+
+#include "test/syscalls/linux/rseq/types.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+
+constexpr uint32_t kRseqSignature = 0x90909090;
+
+extern "C" {
+
+extern void rseq_loop(struct rseq* r, struct rseq_cs* cs);
+extern void* rseq_loop_early_abort;
+extern void* rseq_loop_start;
+extern void* rseq_loop_pre_commit;
+extern void* rseq_loop_post_commit;
+extern void* rseq_loop_abort;
+
+extern int rseq_getpid(struct rseq* r, struct rseq_cs* cs);
+extern void* rseq_getpid_start;
+extern void* rseq_getpid_post_commit;
+extern void* rseq_getpid_abort;
+
+} // extern "C"
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_CRITICAL_H_
diff --git a/test/syscalls/linux/rseq/critical_amd64.S b/test/syscalls/linux/rseq/critical_amd64.S
new file mode 100644
index 000000000..8c0687e6d
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical_amd64.S
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Restartable sequences critical sections.
+
+// Loops continuously until aborted.
+//
+// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
+
+ .text
+ .globl rseq_loop
+ .type rseq_loop, @function
+
+rseq_loop:
+ jmp begin
+
+ // Abort block before the critical section.
+ // Abort signature is 4 nops for simplicity.
+ .byte 0x90, 0x90, 0x90, 0x90
+ .globl rseq_loop_early_abort
+rseq_loop_early_abort:
+ ret
+
+begin:
+ // r->rseq_cs = cs
+ movq %rsi, 8(%rdi)
+
+ // N.B. rseq_cs will be cleared by any preempt, even outside the critical
+ // section. Thus it must be set in or immediately before the critical section
+ // to ensure it is not cleared before the section begins.
+ .globl rseq_loop_start
+rseq_loop_start:
+ jmp rseq_loop_start
+
+ // "Pre-commit": extra instructions inside the critical section. These are
+ // used as the abort point in TestAbortPreCommit, which is not valid.
+ .globl rseq_loop_pre_commit
+rseq_loop_pre_commit:
+ // Extra abort signature + nop for TestAbortPostCommit.
+ .byte 0x90, 0x90, 0x90, 0x90
+ nop
+
+ // "Post-commit": never reached in this case.
+ .globl rseq_loop_post_commit
+rseq_loop_post_commit:
+
+ // Abort signature is 4 nops for simplicity.
+ .byte 0x90, 0x90, 0x90, 0x90
+
+ .globl rseq_loop_abort
+rseq_loop_abort:
+ ret
+
+ .size rseq_loop,.-rseq_loop
+ .section .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/critical_arm64.S b/test/syscalls/linux/rseq/critical_arm64.S
new file mode 100644
index 000000000..bfe7e8307
--- /dev/null
+++ b/test/syscalls/linux/rseq/critical_arm64.S
@@ -0,0 +1,66 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Restartable sequences critical sections.
+
+// Loops continuously until aborted.
+//
+// void rseq_loop(struct rseq* r, struct rseq_cs* cs)
+
+ .text
+ .globl rseq_loop
+ .type rseq_loop, @function
+
+rseq_loop:
+ b begin
+
+ // Abort block before the critical section.
+ // Abort signature.
+ .byte 0x90, 0x90, 0x90, 0x90
+ .globl rseq_loop_early_abort
+rseq_loop_early_abort:
+ ret
+
+begin:
+ // r->rseq_cs = cs
+ str x1, [x0, #8]
+
+ // N.B. rseq_cs will be cleared by any preempt, even outside the critical
+ // section. Thus it must be set in or immediately before the critical section
+ // to ensure it is not cleared before the section begins.
+ .globl rseq_loop_start
+rseq_loop_start:
+ b rseq_loop_start
+
+ // "Pre-commit": extra instructions inside the critical section. These are
+ // used as the abort point in TestAbortPreCommit, which is not valid.
+ .globl rseq_loop_pre_commit
+rseq_loop_pre_commit:
+ // Extra abort signature + nop for TestAbortPostCommit.
+ .byte 0x90, 0x90, 0x90, 0x90
+ nop
+
+ // "Post-commit": never reached in this case.
+ .globl rseq_loop_post_commit
+rseq_loop_post_commit:
+
+ // Abort signature.
+ .byte 0x90, 0x90, 0x90, 0x90
+
+ .globl rseq_loop_abort
+rseq_loop_abort:
+ ret
+
+ .size rseq_loop,.-rseq_loop
+ .section .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/rseq.cc b/test/syscalls/linux/rseq/rseq.cc
new file mode 100644
index 000000000..f036db26d
--- /dev/null
+++ b/test/syscalls/linux/rseq/rseq.cc
@@ -0,0 +1,366 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/rseq/critical.h"
+#include "test/syscalls/linux/rseq/syscalls.h"
+#include "test/syscalls/linux/rseq/test.h"
+#include "test/syscalls/linux/rseq/types.h"
+#include "test/syscalls/linux/rseq/uapi.h"
+
+namespace gvisor {
+namespace testing {
+
+extern "C" int main(int argc, char** argv, char** envp);
+
+// Standalone initialization before calling main().
+extern "C" void __init(uintptr_t* sp) {
+ int argc = sp[0];
+ char** argv = reinterpret_cast<char**>(&sp[1]);
+ char** envp = &argv[argc + 1];
+
+ // Call main() and exit.
+ sys_exit_group(main(argc, argv, envp));
+
+ // sys_exit_group does not return
+}
+
+int strcmp(const char* s1, const char* s2) {
+ const unsigned char* p1 = reinterpret_cast<const unsigned char*>(s1);
+ const unsigned char* p2 = reinterpret_cast<const unsigned char*>(s2);
+
+ while (*p1 == *p2) {
+ if (!*p1) {
+ return 0;
+ }
+ ++p1;
+ ++p2;
+ }
+ return static_cast<int>(*p1) - static_cast<int>(*p2);
+}
+
+int sys_rseq(struct rseq* rseq, uint32_t rseq_len, int flags, uint32_t sig) {
+ return raw_syscall(kRseqSyscall, rseq, rseq_len, flags, sig);
+}
+
+// Test that rseq must be aligned.
+int TestUnaligned() {
+ constexpr uintptr_t kRequiredAlignment = alignof(rseq);
+
+ char buf[2 * kRequiredAlignment] = {};
+ uintptr_t ptr = reinterpret_cast<uintptr_t>(&buf[0]);
+ if ((ptr & (kRequiredAlignment - 1)) == 0) {
+ // buf is already aligned. Misalign it.
+ ptr++;
+ }
+
+ int ret = sys_rseq(reinterpret_cast<rseq*>(ptr), sizeof(rseq), 0, 0);
+ if (sys_errno(ret) != EINVAL) {
+ return 1;
+ }
+ return 0;
+}
+
+// Sanity test that registration works.
+int TestRegister() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+ return 1;
+ }
+ return 0;
+};
+
+// Registration can't be done twice.
+int TestDoubleRegister() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != EBUSY) {
+ return 1;
+ }
+
+ return 0;
+};
+
+// Registration can be done again after unregister.
+int TestRegisterUnregister() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, 0);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ return 0;
+};
+
+// The pointer to rseq must match on register/unregister.
+int TestUnregisterDifferentPtr() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq r2 = {};
+ if (int ret = sys_rseq(&r2, sizeof(r2), kRseqFlagUnregister, 0);
+ sys_errno(ret) != EINVAL) {
+ return 1;
+ }
+
+ return 0;
+};
+
+// The signature must match on register/unregister.
+int TestUnregisterDifferentSignature() {
+ constexpr int kSignature = 0;
+
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kSignature); sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ if (int ret = sys_rseq(&r, sizeof(r), kRseqFlagUnregister, kSignature + 1);
+ sys_errno(ret) != EPERM) {
+ return 1;
+ }
+
+ return 0;
+};
+
+// The CPU ID is initialized.
+int TestCPU() {
+ struct rseq r = {};
+ r.cpu_id = kRseqCPUIDUninitialized;
+
+ if (int ret = sys_rseq(&r, sizeof(r), 0, 0); sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ if (__atomic_load_n(&r.cpu_id, __ATOMIC_RELAXED) < 0) {
+ return 1;
+ }
+ if (__atomic_load_n(&r.cpu_id_start, __ATOMIC_RELAXED) < 0) {
+ return 1;
+ }
+
+ return 0;
+};
+
+// Critical section is eventually aborted.
+int TestAbort() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq_cs cs = {};
+ cs.version = 0;
+ cs.flags = 0;
+ cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+ reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+ // Loops until abort. If this returns then abort occurred.
+ rseq_loop(&r, &cs);
+
+ return 0;
+};
+
+// Abort may be before the critical section.
+int TestAbortBefore() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq_cs cs = {};
+ cs.version = 0;
+ cs.flags = 0;
+ cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+ reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_early_abort);
+
+ // Loops until abort. If this returns then abort occurred.
+ rseq_loop(&r, &cs);
+
+ return 0;
+};
+
+// Signature must match.
+int TestAbortSignature() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq_cs cs = {};
+ cs.version = 0;
+ cs.flags = 0;
+ cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+ reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+ // Loops until abort. This should SIGSEGV on abort.
+ rseq_loop(&r, &cs);
+
+ return 1;
+};
+
+// Abort must not be in the critical section.
+int TestAbortPreCommit() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature + 1);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq_cs cs = {};
+ cs.version = 0;
+ cs.flags = 0;
+ cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+ reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_pre_commit);
+
+ // Loops until abort. This should SIGSEGV on abort.
+ rseq_loop(&r, &cs);
+
+ return 1;
+};
+
+// rseq.rseq_cs is cleared on abort.
+int TestAbortClearsCS() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq_cs cs = {};
+ cs.version = 0;
+ cs.flags = 0;
+ cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+ reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+ // Loops until abort. If this returns then abort occurred.
+ rseq_loop(&r, &cs);
+
+ if (__atomic_load_n(&r.rseq_cs, __ATOMIC_RELAXED)) {
+ return 1;
+ }
+
+ return 0;
+};
+
+// rseq.rseq_cs is cleared on abort outside of critical section.
+int TestInvalidAbortClearsCS() {
+ struct rseq r = {};
+ if (int ret = sys_rseq(&r, sizeof(r), 0, kRseqSignature);
+ sys_errno(ret) != 0) {
+ return 1;
+ }
+
+ struct rseq_cs cs = {};
+ cs.version = 0;
+ cs.flags = 0;
+ cs.start_ip = reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.post_commit_offset = reinterpret_cast<uint64_t>(&rseq_loop_post_commit) -
+ reinterpret_cast<uint64_t>(&rseq_loop_start);
+ cs.abort_ip = reinterpret_cast<uint64_t>(&rseq_loop_abort);
+
+ __atomic_store_n(&r.rseq_cs, &cs, __ATOMIC_RELAXED);
+
+ // When the next abort condition occurs, the kernel will clear cs once it
+ // determines we aren't in the critical section.
+ while (1) {
+ if (!__atomic_load_n(&r.rseq_cs, __ATOMIC_RELAXED)) {
+ break;
+ }
+ }
+
+ return 0;
+};
+
+// Exit codes:
+// 0 - Pass
+// 1 - Fail
+// 2 - Missing argument
+// 3 - Unknown test case
+extern "C" int main(int argc, char** argv, char** envp) {
+ if (argc != 2) {
+ // Usage: rseq <test case>
+ return 2;
+ }
+
+ if (strcmp(argv[1], kRseqTestUnaligned) == 0) {
+ return TestUnaligned();
+ }
+ if (strcmp(argv[1], kRseqTestRegister) == 0) {
+ return TestRegister();
+ }
+ if (strcmp(argv[1], kRseqTestDoubleRegister) == 0) {
+ return TestDoubleRegister();
+ }
+ if (strcmp(argv[1], kRseqTestRegisterUnregister) == 0) {
+ return TestRegisterUnregister();
+ }
+ if (strcmp(argv[1], kRseqTestUnregisterDifferentPtr) == 0) {
+ return TestUnregisterDifferentPtr();
+ }
+ if (strcmp(argv[1], kRseqTestUnregisterDifferentSignature) == 0) {
+ return TestUnregisterDifferentSignature();
+ }
+ if (strcmp(argv[1], kRseqTestCPU) == 0) {
+ return TestCPU();
+ }
+ if (strcmp(argv[1], kRseqTestAbort) == 0) {
+ return TestAbort();
+ }
+ if (strcmp(argv[1], kRseqTestAbortBefore) == 0) {
+ return TestAbortBefore();
+ }
+ if (strcmp(argv[1], kRseqTestAbortSignature) == 0) {
+ return TestAbortSignature();
+ }
+ if (strcmp(argv[1], kRseqTestAbortPreCommit) == 0) {
+ return TestAbortPreCommit();
+ }
+ if (strcmp(argv[1], kRseqTestAbortClearsCS) == 0) {
+ return TestAbortClearsCS();
+ }
+ if (strcmp(argv[1], kRseqTestInvalidAbortClearsCS) == 0) {
+ return TestInvalidAbortClearsCS();
+ }
+
+ return 3;
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/rseq/start_amd64.S b/test/syscalls/linux/rseq/start_amd64.S
new file mode 100644
index 000000000..b9611b276
--- /dev/null
+++ b/test/syscalls/linux/rseq/start_amd64.S
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+ .text
+ .align 4
+ .type _start,@function
+ .globl _start
+
+_start:
+ movq %rsp,%rdi
+ call __init
+ hlt
+
+ .size _start,.-_start
+ .section .note.GNU-stack,"",@progbits
+
+ .text
+ .globl raw_syscall
+ .type raw_syscall, @function
+
+raw_syscall:
+ mov %rdi,%rax // syscall #
+ mov %rsi,%rdi // arg0
+ mov %rdx,%rsi // arg1
+ mov %rcx,%rdx // arg2
+ mov %r8,%r10 // arg3 (goes in r10 instead of rcx for system calls)
+ mov %r9,%r8 // arg4
+ mov 0x8(%rsp),%r9 // arg5
+ syscall
+ ret
+
+ .size raw_syscall,.-raw_syscall
+ .section .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/start_arm64.S b/test/syscalls/linux/rseq/start_arm64.S
new file mode 100644
index 000000000..693c1c6eb
--- /dev/null
+++ b/test/syscalls/linux/rseq/start_arm64.S
@@ -0,0 +1,45 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+ .text
+ .align 4
+ .type _start,@function
+ .globl _start
+
+_start:
+ mov x29, sp
+ bl __init
+ wfi
+
+ .size _start,.-_start
+ .section .note.GNU-stack,"",@progbits
+
+ .text
+ .globl raw_syscall
+ .type raw_syscall, @function
+
+raw_syscall:
+ mov x8,x0 // syscall #
+ mov x0,x1 // arg0
+ mov x1,x2 // arg1
+ mov x2,x3 // arg2
+ mov x3,x4 // arg3
+ mov x4,x5 // arg4
+ mov x5,x6 // arg5
+ svc #0
+ ret
+
+ .size raw_syscall,.-raw_syscall
+ .section .note.GNU-stack,"",@progbits
diff --git a/test/syscalls/linux/rseq/syscalls.h b/test/syscalls/linux/rseq/syscalls.h
new file mode 100644
index 000000000..c4118e6c5
--- /dev/null
+++ b/test/syscalls/linux/rseq/syscalls.h
@@ -0,0 +1,69 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
+
+#include "test/syscalls/linux/rseq/types.h"
+
+// Syscall numbers.
+#if defined(__x86_64__)
+constexpr int kGetpid = 39;
+constexpr int kExitGroup = 231;
+#elif defined(__aarch64__)
+constexpr int kGetpid = 172;
+constexpr int kExitGroup = 94;
+#else
+#error "Unknown architecture"
+#endif
+
+namespace gvisor {
+namespace testing {
+
+// Standalone system call interfaces.
+// Note that these are all "raw" system call interfaces which encode
+// errors by setting the return value to a small negative number.
+// Use sys_errno() to check system call return values for errors.
+
+// Maximum Linux error number.
+constexpr int kMaxErrno = 4095;
+
+// Errno values.
+#define EPERM 1
+#define EFAULT 14
+#define EBUSY 16
+#define EINVAL 22
+
+// Get the error number from a raw system call return value.
+// Returns a positive error number or 0 if there was no error.
+static inline int sys_errno(uintptr_t rval) {
+ if (rval >= static_cast<uintptr_t>(-kMaxErrno)) {
+ return -static_cast<int>(rval);
+ }
+ return 0;
+}
+
+extern "C" uintptr_t raw_syscall(int number, ...);
+
+static inline void sys_exit_group(int status) {
+ raw_syscall(kExitGroup, status);
+}
+static inline int sys_getpid() {
+ return static_cast<int>(raw_syscall(kGetpid));
+}
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_SYSCALLS_H_
diff --git a/test/syscalls/linux/rseq/test.h b/test/syscalls/linux/rseq/test.h
new file mode 100644
index 000000000..3b7bb74b1
--- /dev/null
+++ b/test/syscalls/linux/rseq/test.h
@@ -0,0 +1,43 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
+
+namespace gvisor {
+namespace testing {
+
+// Test cases supported by rseq binary.
+
+inline constexpr char kRseqTestUnaligned[] = "unaligned";
+inline constexpr char kRseqTestRegister[] = "register";
+inline constexpr char kRseqTestDoubleRegister[] = "double-register";
+inline constexpr char kRseqTestRegisterUnregister[] = "register-unregister";
+inline constexpr char kRseqTestUnregisterDifferentPtr[] =
+ "unregister-different-ptr";
+inline constexpr char kRseqTestUnregisterDifferentSignature[] =
+ "unregister-different-signature";
+inline constexpr char kRseqTestCPU[] = "cpu";
+inline constexpr char kRseqTestAbort[] = "abort";
+inline constexpr char kRseqTestAbortBefore[] = "abort-before";
+inline constexpr char kRseqTestAbortSignature[] = "abort-signature";
+inline constexpr char kRseqTestAbortPreCommit[] = "abort-precommit";
+inline constexpr char kRseqTestAbortClearsCS[] = "abort-clears-cs";
+inline constexpr char kRseqTestInvalidAbortClearsCS[] =
+ "invalid-abort-clears-cs";
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TEST_H_
diff --git a/test/syscalls/linux/rseq/types.h b/test/syscalls/linux/rseq/types.h
new file mode 100644
index 000000000..b6afe9817
--- /dev/null
+++ b/test/syscalls/linux/rseq/types.h
@@ -0,0 +1,31 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
+
+using size_t = __SIZE_TYPE__;
+using uintptr_t = __UINTPTR_TYPE__;
+
+using uint8_t = __UINT8_TYPE__;
+using uint16_t = __UINT16_TYPE__;
+using uint32_t = __UINT32_TYPE__;
+using uint64_t = __UINT64_TYPE__;
+
+using int8_t = __INT8_TYPE__;
+using int16_t = __INT16_TYPE__;
+using int32_t = __INT32_TYPE__;
+using int64_t = __INT64_TYPE__;
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_TYPES_H_
diff --git a/test/syscalls/linux/rseq/uapi.h b/test/syscalls/linux/rseq/uapi.h
new file mode 100644
index 000000000..d3e60d0a4
--- /dev/null
+++ b/test/syscalls/linux/rseq/uapi.h
@@ -0,0 +1,51 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
+
+#include <stdint.h>
+
+// User-kernel ABI for restartable sequences.
+
+// Syscall numbers.
+#if defined(__x86_64__)
+constexpr int kRseqSyscall = 334;
+#elif defined(__aarch64__)
+constexpr int kRseqSyscall = 293;
+#else
+#error "Unknown architecture"
+#endif // __x86_64__
+
+struct rseq_cs {
+ uint32_t version;
+ uint32_t flags;
+ uint64_t start_ip;
+ uint64_t post_commit_offset;
+ uint64_t abort_ip;
+} __attribute__((aligned(4 * sizeof(uint64_t))));
+
+// N.B. alignment is enforced by the kernel.
+struct rseq {
+ uint32_t cpu_id_start;
+ uint32_t cpu_id;
+ struct rseq_cs* rseq_cs;
+ uint32_t flags;
+} __attribute__((aligned(4 * sizeof(uint64_t))));
+
+constexpr int kRseqFlagUnregister = 1 << 0;
+
+constexpr int kRseqCPUIDUninitialized = -1;
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_RSEQ_UAPI_H_
diff --git a/test/syscalls/linux/rtsignal.cc b/test/syscalls/linux/rtsignal.cc
new file mode 100644
index 000000000..ed27e2566
--- /dev/null
+++ b/test/syscalls/linux/rtsignal.cc
@@ -0,0 +1,171 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <cerrno>
+#include <csignal>
+
+#include "gtest/gtest.h"
+#include "test/util/cleanup.h"
+#include "test/util/logging.h"
+#include "test/util/posix_error.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// saved_info is set by the handler.
+siginfo_t saved_info;
+
+// has_saved_info is set to true by the handler.
+volatile bool has_saved_info;
+
+void SigHandler(int sig, siginfo_t* info, void* context) {
+ // Copy to the given info.
+ saved_info = *info;
+ has_saved_info = true;
+}
+
+void ClearSavedInfo() {
+ // Clear the cached info.
+ memset(&saved_info, 0, sizeof(saved_info));
+ has_saved_info = false;
+}
+
+PosixErrorOr<Cleanup> SetupSignalHandler(int sig) {
+ struct sigaction sa;
+ sa.sa_sigaction = SigHandler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ return ScopedSigaction(sig, sa);
+}
+
+class RtSignalTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ action_cleanup_ = ASSERT_NO_ERRNO_AND_VALUE(SetupSignalHandler(SIGUSR1));
+ mask_cleanup_ =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGUSR1));
+ }
+
+ void TearDown() override { ClearSavedInfo(); }
+
+ private:
+ Cleanup action_cleanup_;
+ Cleanup mask_cleanup_;
+};
+
+static int rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t* uinfo) {
+ int ret;
+ do {
+ // NOTE(b/25434735): rt_sigqueueinfo(2) could return EAGAIN for RT signals.
+ ret = syscall(SYS_rt_sigqueueinfo, tgid, sig, uinfo);
+ } while (ret == -1 && errno == EAGAIN);
+ return ret;
+}
+
+TEST_F(RtSignalTest, InvalidTID) {
+ siginfo_t uinfo;
+ // Depending on the kernel version, these calls may fail with
+ // ESRCH (goobunutu machines) or EPERM (production machines). Thus,
+ // the test simply ensures that they do fail.
+ EXPECT_THAT(rt_sigqueueinfo(-1, SIGUSR1, &uinfo), SyscallFails());
+ EXPECT_FALSE(has_saved_info);
+ EXPECT_THAT(rt_sigqueueinfo(0, SIGUSR1, &uinfo), SyscallFails());
+ EXPECT_FALSE(has_saved_info);
+}
+
+TEST_F(RtSignalTest, InvalidCodes) {
+ siginfo_t uinfo;
+
+ // We need a child for the code checks to apply. If the process is delivering
+ // to itself, then it can use whatever codes it wants and they will go
+ // through.
+ pid_t child = fork();
+ if (child == 0) {
+ _exit(1);
+ }
+ ASSERT_THAT(child, SyscallSucceeds());
+
+ // These are not allowed for child processes.
+ uinfo.si_code = 0; // SI_USER.
+ EXPECT_THAT(rt_sigqueueinfo(child, SIGUSR1, &uinfo),
+ SyscallFailsWithErrno(EPERM));
+ uinfo.si_code = 0x80; // SI_KERNEL.
+ EXPECT_THAT(rt_sigqueueinfo(child, SIGUSR1, &uinfo),
+ SyscallFailsWithErrno(EPERM));
+ uinfo.si_code = -6; // SI_TKILL.
+ EXPECT_THAT(rt_sigqueueinfo(child, SIGUSR1, &uinfo),
+ SyscallFailsWithErrno(EPERM));
+ uinfo.si_code = -1; // SI_QUEUE (allowed).
+ EXPECT_THAT(rt_sigqueueinfo(child, SIGUSR1, &uinfo), SyscallSucceeds());
+
+ // Join the child process.
+ EXPECT_THAT(waitpid(child, nullptr, 0), SyscallSucceeds());
+}
+
+TEST_F(RtSignalTest, ValueDelivered) {
+ siginfo_t uinfo;
+ uinfo.si_code = -1; // SI_QUEUE (allowed).
+ uinfo.si_errno = 0x1234;
+
+ EXPECT_EQ(saved_info.si_errno, 0x0);
+ EXPECT_THAT(rt_sigqueueinfo(getpid(), SIGUSR1, &uinfo), SyscallSucceeds());
+ EXPECT_TRUE(has_saved_info);
+ EXPECT_EQ(saved_info.si_errno, 0x1234);
+}
+
+TEST_F(RtSignalTest, SignoMatch) {
+ auto action2_cleanup = ASSERT_NO_ERRNO_AND_VALUE(SetupSignalHandler(SIGUSR2));
+ auto mask2_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGUSR2));
+
+ siginfo_t uinfo;
+ uinfo.si_code = -1; // SI_QUEUE (allowed).
+
+ EXPECT_THAT(rt_sigqueueinfo(getpid(), SIGUSR1, &uinfo), SyscallSucceeds());
+ EXPECT_TRUE(has_saved_info);
+ EXPECT_EQ(saved_info.si_signo, SIGUSR1);
+
+ ClearSavedInfo();
+
+ EXPECT_THAT(rt_sigqueueinfo(getpid(), SIGUSR2, &uinfo), SyscallSucceeds());
+ EXPECT_TRUE(has_saved_info);
+ EXPECT_EQ(saved_info.si_signo, SIGUSR2);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ // These tests depend on delivering SIGUSR1/2 to the main thread (so they can
+ // synchronously check has_saved_info). Block these so that any other threads
+ // created by TestInit will also have them blocked.
+ sigset_t set;
+ sigemptyset(&set);
+ sigaddset(&set, SIGUSR1);
+ sigaddset(&set, SIGUSR2);
+ TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
+
+ gvisor::testing::TestInit(&argc, &argv);
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/sched.cc b/test/syscalls/linux/sched.cc
new file mode 100644
index 000000000..735e99411
--- /dev/null
+++ b/test/syscalls/linux/sched.cc
@@ -0,0 +1,71 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <sched.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// In linux, pid is limited to 29 bits because how futex is implemented.
+constexpr int kImpossiblePID = (1 << 29) + 1;
+
+TEST(SchedGetparamTest, ReturnsZero) {
+ struct sched_param param;
+ EXPECT_THAT(sched_getparam(getpid(), &param), SyscallSucceeds());
+ EXPECT_EQ(param.sched_priority, 0);
+ EXPECT_THAT(sched_getparam(/*pid=*/0, &param), SyscallSucceeds());
+ EXPECT_EQ(param.sched_priority, 0);
+}
+
+TEST(SchedGetparamTest, InvalidPIDReturnsEINVAL) {
+ struct sched_param param;
+ EXPECT_THAT(sched_getparam(/*pid=*/-1, &param),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SchedGetparamTest, ImpossiblePIDReturnsESRCH) {
+ struct sched_param param;
+ EXPECT_THAT(sched_getparam(kImpossiblePID, &param),
+ SyscallFailsWithErrno(ESRCH));
+}
+
+TEST(SchedGetparamTest, NullParamReturnsEINVAL) {
+ EXPECT_THAT(sched_getparam(0, nullptr), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SchedGetschedulerTest, ReturnsSchedOther) {
+ EXPECT_THAT(sched_getscheduler(getpid()),
+ SyscallSucceedsWithValue(SCHED_OTHER));
+ EXPECT_THAT(sched_getscheduler(/*pid=*/0),
+ SyscallSucceedsWithValue(SCHED_OTHER));
+}
+
+TEST(SchedGetschedulerTest, ReturnsEINVAL) {
+ EXPECT_THAT(sched_getscheduler(/*pid=*/-1), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SchedGetschedulerTest, ReturnsESRCH) {
+ EXPECT_THAT(sched_getscheduler(kImpossiblePID), SyscallFailsWithErrno(ESRCH));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/sched_yield.cc b/test/syscalls/linux/sched_yield.cc
new file mode 100644
index 000000000..5d24f5b58
--- /dev/null
+++ b/test/syscalls/linux/sched_yield.cc
@@ -0,0 +1,33 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sched.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(SchedYieldTest, Success) {
+ EXPECT_THAT(sched_yield(), SyscallSucceeds());
+ EXPECT_THAT(sched_yield(), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/seccomp.cc b/test/syscalls/linux/seccomp.cc
new file mode 100644
index 000000000..ce88d90dd
--- /dev/null
+++ b/test/syscalls/linux/seccomp.cc
@@ -0,0 +1,425 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <ucontext.h>
+#include <unistd.h>
+
+#include <atomic>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "test/util/logging.h"
+#include "test/util/memory_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/proc_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+#ifndef SYS_SECCOMP
+#define SYS_SECCOMP 1
+#endif
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// A syscall not implemented by Linux that we don't expect to be called.
+#ifdef __x86_64__
+constexpr uint32_t kFilteredSyscall = SYS_vserver;
+#elif __aarch64__
+// Use the last of arch_specific_syscalls which are not implemented on arm64.
+constexpr uint32_t kFilteredSyscall = __NR_arch_specific_syscall + 15;
+#endif
+
+// Applies a seccomp-bpf filter that returns `filtered_result` for
+// `sysno` and allows all other syscalls. Async-signal-safe.
+void ApplySeccompFilter(uint32_t sysno, uint32_t filtered_result,
+ uint32_t flags = 0) {
+ // "Prior to [PR_SET_SECCOMP], the task must call prctl(PR_SET_NO_NEW_PRIVS,
+ // 1) or run with CAP_SYS_ADMIN privileges in its namespace." -
+ // Documentation/prctl/seccomp_filter.txt
+ //
+ // prctl(PR_SET_NO_NEW_PRIVS, 1) may be called repeatedly; calls after the
+ // first are no-ops.
+ TEST_PCHECK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0);
+ MaybeSave();
+
+ struct sock_filter filter[] = {
+ // A = seccomp_data.arch
+ BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 4),
+#if defined(__x86_64__)
+ // if (A != AUDIT_ARCH_X86_64) goto kill
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 4),
+#elif defined(__aarch64__)
+ // if (A != AUDIT_ARCH_AARCH64) goto kill
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_AARCH64, 0, 4),
+#else
+#error "Unknown architecture"
+#endif
+ // A = seccomp_data.nr
+ BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0),
+ // if (A != sysno) goto allow
+ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, sysno, 0, 1),
+ // return filtered_result
+ BPF_STMT(BPF_RET | BPF_K, filtered_result),
+ // allow: return SECCOMP_RET_ALLOW
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+ // kill: return SECCOMP_RET_KILL
+ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
+ };
+ struct sock_fprog prog;
+ prog.len = ABSL_ARRAYSIZE(filter);
+ prog.filter = filter;
+ if (flags) {
+ TEST_CHECK(syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, flags, &prog) ==
+ 0);
+ } else {
+ TEST_PCHECK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0) == 0);
+ }
+ MaybeSave();
+}
+
+// Wrapper for sigaction. Async-signal-safe.
+void RegisterSignalHandler(int signum,
+ void (*handler)(int, siginfo_t*, void*)) {
+ struct sigaction sa = {};
+ sa.sa_sigaction = handler;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ TEST_PCHECK(sigaction(signum, &sa, nullptr) == 0);
+ MaybeSave();
+}
+
+// All of the following tests execute in a subprocess to ensure that each test
+// is run in a separate process. This avoids cross-contamination of seccomp
+// state between tests, and is necessary to ensure that test processes killed
+// by SECCOMP_RET_KILL are single-threaded (since SECCOMP_RET_KILL only kills
+// the offending thread, not the whole thread group).
+
+TEST(SeccompTest, RetKillCausesDeathBySIGSYS) {
+ pid_t const pid = fork();
+ if (pid == 0) {
+ // Register a signal handler for SIGSYS that we don't expect to be invoked.
+ RegisterSignalHandler(
+ SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
+ ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
+ syscall(kFilteredSyscall);
+ TEST_CHECK_MSG(false, "Survived invocation of test syscall");
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS)
+ << "status " << status;
+}
+
+TEST(SeccompTest, RetKillOnlyKillsOneThread) {
+ Mapping stack = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+
+ pid_t const pid = fork();
+ if (pid == 0) {
+ // Register a signal handler for SIGSYS that we don't expect to be invoked.
+ RegisterSignalHandler(
+ SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
+ ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
+ // Pass CLONE_VFORK to block the original thread in the child process until
+ // the clone thread exits with SIGSYS.
+ //
+ // N.B. clone(2) is not officially async-signal-safe, but at minimum glibc's
+ // x86_64 implementation is safe. See glibc
+ // sysdeps/unix/sysv/linux/x86_64/clone.S.
+ clone(
+ +[](void* arg) {
+ syscall(kFilteredSyscall); // should kill the thread
+ _exit(1); // should be unreachable
+ return 2; // should be very unreachable, shut up the compiler
+ },
+ stack.endptr(),
+ CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM |
+ CLONE_VFORK,
+ nullptr);
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status " << status;
+}
+
+TEST(SeccompTest, RetTrapCausesSIGSYS) {
+ pid_t const pid = fork();
+ if (pid == 0) {
+ constexpr uint16_t kTrapValue = 0xdead;
+ RegisterSignalHandler(
+ SIGSYS, +[](int signo, siginfo_t* info, void* ucv) {
+ ucontext_t* uc = static_cast<ucontext_t*>(ucv);
+ // This is a signal handler, so we must stay async-signal-safe.
+ TEST_CHECK(info->si_signo == SIGSYS);
+ TEST_CHECK(info->si_code == SYS_SECCOMP);
+ TEST_CHECK(info->si_errno == kTrapValue);
+ TEST_CHECK(info->si_call_addr != nullptr);
+ TEST_CHECK(info->si_syscall == kFilteredSyscall);
+#if defined(__x86_64__)
+ TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64);
+ TEST_CHECK(uc->uc_mcontext.gregs[REG_RAX] == kFilteredSyscall);
+#elif defined(__aarch64__)
+ TEST_CHECK(info->si_arch == AUDIT_ARCH_AARCH64);
+ TEST_CHECK(uc->uc_mcontext.regs[8] == kFilteredSyscall);
+#endif // defined(__x86_64__)
+ _exit(0);
+ });
+ ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRAP | kTrapValue);
+ syscall(kFilteredSyscall);
+ TEST_CHECK_MSG(false, "Survived invocation of test syscall");
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status " << status;
+}
+
+#ifdef __x86_64__
+
+constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
+
+time_t vsyscall_time(time_t* t) {
+ return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
+}
+
+TEST(SeccompTest, SeccompAppliesToVsyscall) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled()));
+
+ pid_t const pid = fork();
+ if (pid == 0) {
+ constexpr uint16_t kTrapValue = 0xdead;
+ RegisterSignalHandler(
+ SIGSYS, +[](int signo, siginfo_t* info, void* ucv) {
+ ucontext_t* uc = static_cast<ucontext_t*>(ucv);
+ // This is a signal handler, so we must stay async-signal-safe.
+ TEST_CHECK(info->si_signo == SIGSYS);
+ TEST_CHECK(info->si_code == SYS_SECCOMP);
+ TEST_CHECK(info->si_errno == kTrapValue);
+ TEST_CHECK(info->si_call_addr != nullptr);
+ TEST_CHECK(info->si_syscall == SYS_time);
+ TEST_CHECK(info->si_arch == AUDIT_ARCH_X86_64);
+ TEST_CHECK(uc->uc_mcontext.gregs[REG_RAX] == SYS_time);
+ _exit(0);
+ });
+ ApplySeccompFilter(SYS_time, SECCOMP_RET_TRAP | kTrapValue);
+ vsyscall_time(nullptr); // Should result in death.
+ TEST_CHECK_MSG(false, "Survived invocation of test syscall");
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status " << status;
+}
+
+TEST(SeccompTest, RetKillVsyscallCausesDeathBySIGSYS) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled()));
+
+ pid_t const pid = fork();
+ if (pid == 0) {
+ // Register a signal handler for SIGSYS that we don't expect to be invoked.
+ RegisterSignalHandler(
+ SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
+ ApplySeccompFilter(SYS_time, SECCOMP_RET_KILL);
+ vsyscall_time(nullptr); // Should result in death.
+ TEST_CHECK_MSG(false, "Survived invocation of test syscall");
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS)
+ << "status " << status;
+}
+
+#endif // defined(__x86_64__)
+
+TEST(SeccompTest, RetTraceWithoutPtracerReturnsENOSYS) {
+ pid_t const pid = fork();
+ if (pid == 0) {
+ ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRACE);
+ TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOSYS);
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status " << status;
+}
+
+TEST(SeccompTest, RetErrnoReturnsErrno) {
+ pid_t const pid = fork();
+ if (pid == 0) {
+ // ENOTNAM: "Not a XENIX named type file"
+ ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM);
+ TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOTNAM);
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status " << status;
+}
+
+TEST(SeccompTest, RetAllowAllowsSyscall) {
+ pid_t const pid = fork();
+ if (pid == 0) {
+ ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ALLOW);
+ TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOSYS);
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status " << status;
+}
+
+// This test will validate that TSYNC will apply to all threads.
+TEST(SeccompTest, TsyncAppliesToAllThreads) {
+ Mapping stack = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(2 * kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE));
+
+ // We don't want to apply this policy to other test runner threads, so fork.
+ const pid_t pid = fork();
+
+ if (pid == 0) {
+ // First check that we receive a ENOSYS before the policy is applied.
+ TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOSYS);
+
+ // N.B. clone(2) is not officially async-signal-safe, but at minimum glibc's
+ // x86_64 implementation is safe. See glibc
+ // sysdeps/unix/sysv/linux/x86_64/clone.S.
+ clone(
+ +[](void* arg) {
+ ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM,
+ SECCOMP_FILTER_FLAG_TSYNC);
+ return 0;
+ },
+ stack.endptr(),
+ CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM |
+ CLONE_VFORK,
+ nullptr);
+
+ // Because we're using CLONE_VFORK this thread will be blocked until
+ // the second thread has released resources to our virtual memory, since
+ // we're not execing that will happen on _exit.
+
+ // Now verify that the policy applied to this thread too.
+ TEST_CHECK(syscall(kFilteredSyscall) == -1 && errno == ENOTNAM);
+ _exit(0);
+ }
+
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status = 0;
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status " << status;
+}
+
+// This test will validate that seccomp(2) rejects unsupported flags.
+TEST(SeccompTest, SeccompRejectsUnknownFlags) {
+ constexpr uint32_t kInvalidFlag = 123;
+ ASSERT_THAT(
+ syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, kInvalidFlag, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SeccompTest, LeastPermissiveFilterReturnValueApplies) {
+ // This is RetKillCausesDeathBySIGSYS, plus extra filters before and after the
+ // one that causes the kill that should be ignored.
+ pid_t const pid = fork();
+ if (pid == 0) {
+ RegisterSignalHandler(
+ SIGSYS, +[](int, siginfo_t*, void*) { _exit(1); });
+ ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_TRACE);
+ ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
+ ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_ERRNO | ENOTNAM);
+ syscall(kFilteredSyscall);
+ TEST_CHECK_MSG(false, "Survived invocation of test syscall");
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS)
+ << "status " << status;
+}
+
+// Passed as argv[1] to cause the test binary to invoke kFilteredSyscall and
+// exit. Not a real flag since flag parsing happens during initialization,
+// which may create threads.
+constexpr char kInvokeFilteredSyscallFlag[] = "--seccomp_test_child";
+
+TEST(SeccompTest, FiltersPreservedAcrossForkAndExecve) {
+ ExecveArray const grandchild_argv(
+ {"/proc/self/exe", kInvokeFilteredSyscallFlag});
+
+ pid_t const pid = fork();
+ if (pid == 0) {
+ ApplySeccompFilter(kFilteredSyscall, SECCOMP_RET_KILL);
+ pid_t const grandchild_pid = fork();
+ if (grandchild_pid == 0) {
+ execve(grandchild_argv.get()[0], grandchild_argv.get(),
+ /* envp = */ nullptr);
+ TEST_PCHECK_MSG(false, "execve failed");
+ }
+ int status;
+ TEST_PCHECK(waitpid(grandchild_pid, &status, 0) == grandchild_pid);
+ TEST_CHECK(WIFSIGNALED(status) && WTERMSIG(status) == SIGSYS);
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ int status;
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status " << status;
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ if (argc >= 2 &&
+ strcmp(argv[1], gvisor::testing::kInvokeFilteredSyscallFlag) == 0) {
+ syscall(gvisor::testing::kFilteredSyscall);
+ exit(0);
+ }
+
+ gvisor::testing::TestInit(&argc, &argv);
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/select.cc b/test/syscalls/linux/select.cc
new file mode 100644
index 000000000..be2364fb8
--- /dev/null
+++ b/test/syscalls/linux/select.cc
@@ -0,0 +1,168 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <sys/resource.h>
+#include <sys/select.h>
+#include <sys/time.h>
+
+#include <climits>
+#include <csignal>
+#include <cstdio>
+
+#include "gtest/gtest.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/base_poll_test.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/rlimit_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+class SelectTest : public BasePollTest {
+ protected:
+ void SetUp() override { BasePollTest::SetUp(); }
+ void TearDown() override { BasePollTest::TearDown(); }
+};
+
+// See that when there are no FD sets, select behaves like sleep.
+TEST_F(SelectTest, NullFds) {
+ struct timeval timeout = absl::ToTimeval(absl::Milliseconds(10));
+ ASSERT_THAT(select(0, nullptr, nullptr, nullptr, &timeout),
+ SyscallSucceeds());
+ EXPECT_EQ(timeout.tv_sec, 0);
+ EXPECT_EQ(timeout.tv_usec, 0);
+
+ timeout = absl::ToTimeval(absl::Milliseconds(10));
+ ASSERT_THAT(select(1, nullptr, nullptr, nullptr, &timeout),
+ SyscallSucceeds());
+ EXPECT_EQ(timeout.tv_sec, 0);
+ EXPECT_EQ(timeout.tv_usec, 0);
+}
+
+TEST_F(SelectTest, NegativeNfds) {
+ EXPECT_THAT(select(-1, nullptr, nullptr, nullptr, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(select(-100000, nullptr, nullptr, nullptr, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(select(INT_MIN, nullptr, nullptr, nullptr, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(SelectTest, ClosedFds) {
+ auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file.path(), O_RDONLY));
+
+ // We can't rely on a file descriptor being closed in a multi threaded
+ // application so fork to get a clean process.
+ EXPECT_THAT(InForkedProcess([&] {
+ int fd_num = fd.get();
+ fd.reset();
+
+ fd_set read_set;
+ FD_ZERO(&read_set);
+ FD_SET(fd_num, &read_set);
+
+ struct timeval timeout =
+ absl::ToTimeval(absl::Milliseconds(10));
+ TEST_PCHECK(select(fd_num + 1, &read_set, nullptr, nullptr,
+ &timeout) != 0);
+ TEST_PCHECK(errno == EBADF);
+ }),
+ IsPosixErrorOkAndHolds(0));
+}
+
+TEST_F(SelectTest, ZeroTimeout) {
+ struct timeval timeout = {};
+ EXPECT_THAT(select(1, nullptr, nullptr, nullptr, &timeout),
+ SyscallSucceeds());
+ // Ignore timeout as its value is now undefined.
+}
+
+// If random S/R interrupts the select, SIGALRM may be delivered before select
+// restarts, causing the select to hang forever.
+TEST_F(SelectTest, NoTimeout_NoRandomSave) {
+ // When there's no timeout, select may never return so set a timer.
+ SetTimer(absl::Milliseconds(100));
+ // See that we get interrupted by the timer.
+ ASSERT_THAT(select(1, nullptr, nullptr, nullptr, nullptr),
+ SyscallFailsWithErrno(EINTR));
+ EXPECT_TRUE(TimerFired());
+}
+
+TEST_F(SelectTest, InvalidTimeoutNegative) {
+ struct timeval timeout = absl::ToTimeval(absl::Microseconds(-1));
+ EXPECT_THAT(select(1, nullptr, nullptr, nullptr, &timeout),
+ SyscallFailsWithErrno(EINVAL));
+ // Ignore timeout as its value is now undefined.
+}
+
+// Verify that a signal interrupts select.
+//
+// If random S/R interrupts the select, SIGALRM may be delivered before select
+// restarts, causing the select to hang forever.
+TEST_F(SelectTest, InterruptedBySignal_NoRandomSave) {
+ absl::Duration duration(absl::Seconds(5));
+ struct timeval timeout = absl::ToTimeval(duration);
+ SetTimer(absl::Milliseconds(100));
+ ASSERT_FALSE(TimerFired());
+ ASSERT_THAT(select(1, nullptr, nullptr, nullptr, &timeout),
+ SyscallFailsWithErrno(EINTR));
+ EXPECT_TRUE(TimerFired());
+ // Ignore timeout as its value is now undefined.
+}
+
+TEST_F(SelectTest, IgnoreBitsAboveNfds) {
+ // fd_set is a bit array with at least FD_SETSIZE bits. Test that bits
+ // corresponding to file descriptors above nfds are ignored.
+ fd_set read_set;
+ FD_ZERO(&read_set);
+ constexpr int kNfds = 1;
+ for (int fd = kNfds; fd < FD_SETSIZE; fd++) {
+ FD_SET(fd, &read_set);
+ }
+ // Pass a zero timeout so that select returns immediately.
+ struct timeval timeout = {};
+ EXPECT_THAT(select(kNfds, &read_set, nullptr, nullptr, &timeout),
+ SyscallSucceedsWithValue(0));
+}
+
+// This test illustrates Linux's behavior of 'select' calls passing after
+// setrlimit RLIMIT_NOFILE is called. In particular, versions of sshd rely on
+// this behavior. See b/122318458.
+TEST_F(SelectTest, SetrlimitCallNOFILE) {
+ fd_set read_set;
+ FD_ZERO(&read_set);
+ timeval timeout = {};
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(NewTempAbsPath(), O_RDONLY | O_CREAT, S_IRUSR));
+
+ Cleanup reset_rlimit =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSetSoftRlimit(RLIMIT_NOFILE, 0));
+
+ FD_SET(fd.get(), &read_set);
+ // this call with zero timeout should return immediately
+ EXPECT_THAT(select(fd.get() + 1, &read_set, nullptr, nullptr, &timeout),
+ SyscallSucceeds());
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/semaphore.cc b/test/syscalls/linux/semaphore.cc
new file mode 100644
index 000000000..e9b131ca9
--- /dev/null
+++ b/test/syscalls/linux/semaphore.cc
@@ -0,0 +1,491 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/ipc.h>
+#include <sys/sem.h>
+#include <sys/types.h>
+
+#include <atomic>
+#include <cerrno>
+#include <ctime>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "absl/memory/memory.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/clock.h"
+#include "test/util/capability_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+class AutoSem {
+ public:
+ explicit AutoSem(int id) : id_(id) {}
+ ~AutoSem() {
+ if (id_ >= 0) {
+ EXPECT_THAT(semctl(id_, 0, IPC_RMID), SyscallSucceeds());
+ }
+ }
+
+ int release() {
+ int old = id_;
+ id_ = -1;
+ return old;
+ }
+
+ int get() { return id_; }
+
+ private:
+ int id_ = -1;
+};
+
+TEST(SemaphoreTest, SemGet) {
+ // Test creation and lookup.
+ AutoSem sem(semget(1, 10, IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+ EXPECT_THAT(semget(1, 10, IPC_CREAT), SyscallSucceedsWithValue(sem.get()));
+ EXPECT_THAT(semget(1, 9, IPC_CREAT), SyscallSucceedsWithValue(sem.get()));
+
+ // Creation and lookup failure cases.
+ EXPECT_THAT(semget(1, 11, IPC_CREAT), SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(semget(1, -1, IPC_CREAT), SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(semget(1, 10, IPC_CREAT | IPC_EXCL),
+ SyscallFailsWithErrno(EEXIST));
+ EXPECT_THAT(semget(2, 1, 0), SyscallFailsWithErrno(ENOENT));
+ EXPECT_THAT(semget(2, 0, IPC_CREAT), SyscallFailsWithErrno(EINVAL));
+
+ // Private semaphores never conflict.
+ AutoSem sem2(semget(IPC_PRIVATE, 1, 0));
+ AutoSem sem3(semget(IPC_PRIVATE, 1, 0));
+ ASSERT_THAT(sem2.get(), SyscallSucceeds());
+ EXPECT_NE(sem.get(), sem2.get());
+ ASSERT_THAT(sem3.get(), SyscallSucceeds());
+ EXPECT_NE(sem3.get(), sem2.get());
+}
+
+// Tests simple operations that shouldn't block in a single-thread.
+TEST(SemaphoreTest, SemOpSingleNoBlock) {
+ AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ struct sembuf buf = {};
+ buf.sem_op = 1;
+ ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds());
+
+ buf.sem_op = -1;
+ ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds());
+
+ buf.sem_op = 0;
+ ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds());
+
+ // Error cases with invalid values.
+ ASSERT_THAT(semop(sem.get() + 1, &buf, 1), SyscallFailsWithErrno(EINVAL));
+
+ buf.sem_num = 1;
+ ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallFailsWithErrno(EFBIG));
+
+ ASSERT_THAT(semop(sem.get(), nullptr, 0), SyscallFailsWithErrno(EINVAL));
+}
+
+// Tests multiple operations that shouldn't block in a single-thread.
+TEST(SemaphoreTest, SemOpMultiNoBlock) {
+ AutoSem sem(semget(IPC_PRIVATE, 4, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ struct sembuf bufs[5] = {};
+ bufs[0].sem_num = 0;
+ bufs[0].sem_op = 10;
+ bufs[0].sem_flg = 0;
+
+ bufs[1].sem_num = 1;
+ bufs[1].sem_op = 2;
+ bufs[1].sem_flg = 0;
+
+ bufs[2].sem_num = 2;
+ bufs[2].sem_op = 3;
+ bufs[2].sem_flg = 0;
+
+ bufs[3].sem_num = 0;
+ bufs[3].sem_op = -5;
+ bufs[3].sem_flg = 0;
+
+ bufs[4].sem_num = 2;
+ bufs[4].sem_op = 2;
+ bufs[4].sem_flg = 0;
+
+ ASSERT_THAT(semop(sem.get(), bufs, ABSL_ARRAYSIZE(bufs)), SyscallSucceeds());
+
+ ASSERT_THAT(semctl(sem.get(), 0, GETVAL), SyscallSucceedsWithValue(5));
+ ASSERT_THAT(semctl(sem.get(), 1, GETVAL), SyscallSucceedsWithValue(2));
+ ASSERT_THAT(semctl(sem.get(), 2, GETVAL), SyscallSucceedsWithValue(5));
+ ASSERT_THAT(semctl(sem.get(), 3, GETVAL), SyscallSucceedsWithValue(0));
+
+ for (auto& b : bufs) {
+ b.sem_op = -b.sem_op;
+ }
+ // 0 and 3 order must be reversed, otherwise it will block.
+ std::swap(bufs[0].sem_op, bufs[3].sem_op);
+ ASSERT_THAT(RetryEINTR(semop)(sem.get(), bufs, ABSL_ARRAYSIZE(bufs)),
+ SyscallSucceeds());
+
+ // All semaphores should be back to 0 now.
+ for (size_t i = 0; i < 4; ++i) {
+ ASSERT_THAT(semctl(sem.get(), i, GETVAL), SyscallSucceedsWithValue(0));
+ }
+}
+
+// Makes a best effort attempt to ensure that operation would block.
+TEST(SemaphoreTest, SemOpBlock) {
+ AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ std::atomic<int> blocked = ATOMIC_VAR_INIT(1);
+ ScopedThread th([&sem, &blocked] {
+ absl::SleepFor(absl::Milliseconds(100));
+ ASSERT_EQ(blocked.load(), 1);
+
+ struct sembuf buf = {};
+ buf.sem_op = 1;
+ ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+ });
+
+ struct sembuf buf = {};
+ buf.sem_op = -1;
+ ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+ blocked.store(0);
+}
+
+// Tests that IPC_NOWAIT returns with no wait.
+TEST(SemaphoreTest, SemOpNoBlock) {
+ AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ struct sembuf buf = {};
+ buf.sem_flg = IPC_NOWAIT;
+
+ buf.sem_op = -1;
+ ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallFailsWithErrno(EAGAIN));
+
+ buf.sem_op = 1;
+ ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds());
+
+ buf.sem_op = 0;
+ ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallFailsWithErrno(EAGAIN));
+}
+
+// Test runs 2 threads, one signals the other waits the same number of times.
+TEST(SemaphoreTest, SemOpSimple) {
+ AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ constexpr size_t kLoops = 100;
+ ScopedThread th([&sem] {
+ struct sembuf buf = {};
+ buf.sem_op = 1;
+ for (size_t i = 0; i < kLoops; i++) {
+ // Sleep to prevent making all increments in one shot without letting
+ // the waiter wait.
+ absl::SleepFor(absl::Milliseconds(1));
+ ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds());
+ }
+ });
+
+ struct sembuf buf = {};
+ buf.sem_op = -1;
+ for (size_t i = 0; i < kLoops; i++) {
+ ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+ }
+}
+
+// Tests that semaphore can be removed while there are waiters.
+// NoRandomSave: Test relies on timing that random save throws off.
+TEST(SemaphoreTest, SemOpRemoveWithWaiter_NoRandomSave) {
+ AutoSem sem(semget(IPC_PRIVATE, 2, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ ScopedThread th([&sem] {
+ absl::SleepFor(absl::Milliseconds(250));
+ ASSERT_THAT(semctl(sem.release(), 0, IPC_RMID), SyscallSucceeds());
+ });
+
+ // This must happen before IPC_RMID runs above. Otherwise it fails with EINVAL
+ // instead because the semaphore has already been removed.
+ struct sembuf buf = {};
+ buf.sem_op = -1;
+ ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1),
+ SyscallFailsWithErrno(EIDRM));
+}
+
+// Semaphore isn't fair. It will execute any waiter that can satisfy the
+// request even if it gets in front of other waiters.
+TEST(SemaphoreTest, SemOpBestFitExecution) {
+ AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ ScopedThread th([&sem] {
+ struct sembuf buf = {};
+ buf.sem_op = -2;
+ ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallFails());
+ // Ensure that wait will only unblock when the semaphore is removed. On
+ // EINTR retry it may race with deletion and return EINVAL.
+ ASSERT_TRUE(errno == EIDRM || errno == EINVAL) << "errno=" << errno;
+ });
+
+ // Ensures that '-1' below will unblock even though '-10' above is waiting
+ // for the same semaphore.
+ for (size_t i = 0; i < 10; ++i) {
+ struct sembuf buf = {};
+ buf.sem_op = 1;
+ ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+
+ absl::SleepFor(absl::Milliseconds(10));
+
+ buf.sem_op = -1;
+ ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+ }
+
+ ASSERT_THAT(semctl(sem.release(), 0, IPC_RMID), SyscallSucceeds());
+}
+
+// Executes random operations in multiple threads and verify correctness.
+TEST(SemaphoreTest, SemOpRandom) {
+ // Don't do cooperative S/R tests because there are too many syscalls in
+ // this test,
+ const DisableSave ds;
+
+ AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ // Protects the seed below.
+ absl::Mutex mutex;
+ uint32_t seed = time(nullptr);
+
+ int count = 0; // Tracks semaphore value.
+ bool done = false; // Tells waiters to stop after signal threads are done.
+
+ // These threads will wait in a loop.
+ std::unique_ptr<ScopedThread> decs[5];
+ for (auto& dec : decs) {
+ dec = absl::make_unique<ScopedThread>([&sem, &mutex, &count, &seed, &done] {
+ for (size_t i = 0; i < 500; ++i) {
+ int16_t val;
+ {
+ absl::MutexLock l(&mutex);
+ if (done) {
+ return;
+ }
+ val = (rand_r(&seed) % 10 + 1); // Rand between 1 and 10.
+ count -= val;
+ }
+ struct sembuf buf = {};
+ buf.sem_op = -val;
+ ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+ absl::SleepFor(absl::Milliseconds(val * 2));
+ }
+ });
+ }
+
+ // These threads will wait for zero in a loop.
+ std::unique_ptr<ScopedThread> zeros[5];
+ for (auto& zero : zeros) {
+ zero = absl::make_unique<ScopedThread>([&sem, &mutex, &done] {
+ for (size_t i = 0; i < 500; ++i) {
+ {
+ absl::MutexLock l(&mutex);
+ if (done) {
+ return;
+ }
+ }
+ struct sembuf buf = {};
+ buf.sem_op = 0;
+ ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+ absl::SleepFor(absl::Milliseconds(10));
+ }
+ });
+ }
+
+ // These threads will signal in a loop.
+ std::unique_ptr<ScopedThread> incs[5];
+ for (auto& inc : incs) {
+ inc = absl::make_unique<ScopedThread>([&sem, &mutex, &count, &seed] {
+ for (size_t i = 0; i < 500; ++i) {
+ int16_t val;
+ {
+ absl::MutexLock l(&mutex);
+ val = (rand_r(&seed) % 10 + 1); // Rand between 1 and 10.
+ count += val;
+ }
+ struct sembuf buf = {};
+ buf.sem_op = val;
+ ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds());
+ absl::SleepFor(absl::Milliseconds(val * 2));
+ }
+ });
+ }
+
+ // First wait for signal threads to be done.
+ for (auto& inc : incs) {
+ inc->Join();
+ }
+
+ // Now there could be waiters blocked (remember operations are random).
+ // Notify waiters that we're done and signal semaphore just the right amount.
+ {
+ absl::MutexLock l(&mutex);
+ done = true;
+ struct sembuf buf = {};
+ buf.sem_op = -count;
+ ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallSucceeds());
+ }
+
+ // Now all waiters should unblock and exit.
+ for (auto& dec : decs) {
+ dec->Join();
+ }
+ for (auto& zero : zeros) {
+ zero->Join();
+ }
+}
+
+TEST(SemaphoreTest, SemOpNamespace) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ AutoSem sem(semget(123, 1, 0600 | IPC_CREAT | IPC_EXCL));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ ScopedThread([]() {
+ EXPECT_THAT(unshare(CLONE_NEWIPC), SyscallSucceeds());
+ AutoSem sem(semget(123, 1, 0600 | IPC_CREAT | IPC_EXCL));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+ });
+}
+
+TEST(SemaphoreTest, SemCtlVal) {
+ AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ // Semaphore must start with 0.
+ EXPECT_THAT(semctl(sem.get(), 0, GETVAL), SyscallSucceedsWithValue(0));
+
+ // Increase value and ensure waiters are woken up.
+ ScopedThread th([&sem] {
+ struct sembuf buf = {};
+ buf.sem_op = -10;
+ ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+ });
+
+ ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 9), SyscallSucceeds());
+ EXPECT_THAT(semctl(sem.get(), 0, GETVAL), SyscallSucceedsWithValue(9));
+
+ ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 20), SyscallSucceeds());
+ const int value = semctl(sem.get(), 0, GETVAL);
+ // 10 or 20 because it could have raced with waiter above.
+ EXPECT_TRUE(value == 10 || value == 20) << "value=" << value;
+ th.Join();
+
+ // Set it back to 0 and ensure that waiters are woken up.
+ ScopedThread thZero([&sem] {
+ struct sembuf buf = {};
+ buf.sem_op = 0;
+ ASSERT_THAT(RetryEINTR(semop)(sem.get(), &buf, 1), SyscallSucceeds());
+ });
+ ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 0), SyscallSucceeds());
+ EXPECT_THAT(semctl(sem.get(), 0, GETVAL), SyscallSucceedsWithValue(0));
+ thZero.Join();
+}
+
+TEST(SemaphoreTest, SemCtlValAll) {
+ AutoSem sem(semget(IPC_PRIVATE, 3, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ // Semaphores must start with 0.
+ uint16_t get[3] = {10, 10, 10};
+ EXPECT_THAT(semctl(sem.get(), 1, GETALL, get), SyscallSucceedsWithValue(0));
+ for (auto v : get) {
+ EXPECT_EQ(v, 0);
+ }
+
+ // SetAll and check that they were set.
+ uint16_t vals[3] = {0, 10, 20};
+ EXPECT_THAT(semctl(sem.get(), 1, SETALL, vals), SyscallSucceedsWithValue(0));
+ EXPECT_THAT(semctl(sem.get(), 1, GETALL, get), SyscallSucceedsWithValue(0));
+ for (size_t i = 0; i < ABSL_ARRAYSIZE(vals); ++i) {
+ EXPECT_EQ(get[i], vals[i]);
+ }
+
+ EXPECT_THAT(semctl(sem.get(), 1, SETALL, nullptr),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST(SemaphoreTest, SemCtlGetPid) {
+ AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ ASSERT_THAT(semctl(sem.get(), 0, SETVAL, 1), SyscallSucceeds());
+ EXPECT_THAT(semctl(sem.get(), 0, GETPID), SyscallSucceedsWithValue(getpid()));
+}
+
+TEST(SemaphoreTest, SemCtlGetPidFork) {
+ AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ const pid_t child_pid = fork();
+ if (child_pid == 0) {
+ TEST_PCHECK(semctl(sem.get(), 0, SETVAL, 1) == 0);
+ TEST_PCHECK(semctl(sem.get(), 0, GETPID) == getpid());
+
+ _exit(0);
+ }
+ ASSERT_THAT(child_pid, SyscallSucceeds());
+
+ int status;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << " status " << status;
+}
+
+TEST(SemaphoreTest, SemIpcSet) {
+ // Drop CAP_IPC_OWNER which allows us to bypass semaphore permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_IPC_OWNER, false));
+
+ AutoSem sem(semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT));
+ ASSERT_THAT(sem.get(), SyscallSucceeds());
+
+ struct semid_ds semid = {};
+ semid.sem_perm.uid = getuid();
+ semid.sem_perm.gid = getgid();
+
+ // Make semaphore readonly and check that signal fails.
+ semid.sem_perm.mode = 0400;
+ EXPECT_THAT(semctl(sem.get(), 0, IPC_SET, &semid), SyscallSucceeds());
+ struct sembuf buf = {};
+ buf.sem_op = 1;
+ ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallFailsWithErrno(EACCES));
+
+ // Make semaphore writeonly and check that wait for zero fails.
+ semid.sem_perm.mode = 0200;
+ EXPECT_THAT(semctl(sem.get(), 0, IPC_SET, &semid), SyscallSucceeds());
+ buf.sem_op = 0;
+ ASSERT_THAT(semop(sem.get(), &buf, 1), SyscallFailsWithErrno(EACCES));
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/sendfile.cc b/test/syscalls/linux/sendfile.cc
new file mode 100644
index 000000000..64123e904
--- /dev/null
+++ b/test/syscalls/linux/sendfile.cc
@@ -0,0 +1,587 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <linux/unistd.h>
+#include <sys/eventfd.h>
+#include <sys/sendfile.h>
+#include <unistd.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/eventfd_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(SendFileTest, SendZeroBytes) {
+ // Create temp files.
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Open the output file as write only.
+ const FileDescriptor outf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
+
+ // Send data and verify that sendfile returns the correct value.
+ EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, 0),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST(SendFileTest, InvalidOffset) {
+ // Create temp files.
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Open the output file as write only.
+ const FileDescriptor outf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
+
+ // Send data and verify that sendfile returns the correct value.
+ off_t offset = -1;
+ EXPECT_THAT(sendfile(outf.get(), inf.get(), &offset, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+int memfd_create(const std::string& name, unsigned int flags) {
+ return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST(SendFileTest, Overflow) {
+ // Create input file.
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Open the output file.
+ int fd;
+ EXPECT_THAT(fd = memfd_create("overflow", 0), SyscallSucceeds());
+ const FileDescriptor outf(fd);
+
+ // out_offset + kSize overflows INT64_MAX.
+ loff_t out_offset = 0x7ffffffffffffffeull;
+ constexpr int kSize = 3;
+ EXPECT_THAT(sendfile(outf.get(), inf.get(), &out_offset, kSize),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SendFileTest, SendTrivially) {
+ // Create temp files.
+ constexpr char kData[] = "To be, or not to be, that is the question:";
+ constexpr int kDataSize = sizeof(kData) - 1;
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Open the output file as write only.
+ FileDescriptor outf;
+ outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
+
+ // Send data and verify that sendfile returns the correct value.
+ int bytes_sent;
+ EXPECT_THAT(bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kDataSize),
+ SyscallSucceedsWithValue(kDataSize));
+
+ // Close outf to avoid leak.
+ outf.reset();
+
+ // Open the output file as read only.
+ outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
+
+ // Verify that the output file has the correct data.
+ char actual[kDataSize];
+ ASSERT_THAT(read(outf.get(), &actual, bytes_sent),
+ SyscallSucceedsWithValue(kDataSize));
+ EXPECT_EQ(kData, absl::string_view(actual, bytes_sent));
+}
+
+TEST(SendFileTest, SendTriviallyWithBothFilesReadWrite) {
+ // Create temp files.
+ constexpr char kData[] = "Whether 'tis nobler in the mind to suffer";
+ constexpr int kDataSize = sizeof(kData) - 1;
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as readwrite.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+
+ // Open the output file as readwrite.
+ FileDescriptor outf;
+ outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR));
+
+ // Send data and verify that sendfile returns the correct value.
+ int bytes_sent;
+ EXPECT_THAT(bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kDataSize),
+ SyscallSucceedsWithValue(kDataSize));
+
+ // Close outf to avoid leak.
+ outf.reset();
+
+ // Open the output file as read only.
+ outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
+
+ // Verify that the output file has the correct data.
+ char actual[kDataSize];
+ ASSERT_THAT(read(outf.get(), &actual, bytes_sent),
+ SyscallSucceedsWithValue(kDataSize));
+ EXPECT_EQ(kData, absl::string_view(actual, bytes_sent));
+}
+
+TEST(SendFileTest, SendAndUpdateFileOffset) {
+ // Create temp files.
+ // Test input string length must be > 2 AND even.
+ constexpr char kData[] = "The slings and arrows of outrageous fortune,";
+ constexpr int kDataSize = sizeof(kData) - 1;
+ constexpr int kHalfDataSize = kDataSize / 2;
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Open the output file as write only.
+ FileDescriptor outf;
+ outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
+
+ // Send data and verify that sendfile returns the correct value.
+ int bytes_sent;
+ EXPECT_THAT(
+ bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kHalfDataSize),
+ SyscallSucceedsWithValue(kHalfDataSize));
+
+ // Close outf to avoid leak.
+ outf.reset();
+
+ // Open the output file as read only.
+ outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
+
+ // Verify that the output file has the correct data.
+ char actual[kHalfDataSize];
+ ASSERT_THAT(read(outf.get(), &actual, bytes_sent),
+ SyscallSucceedsWithValue(kHalfDataSize));
+ EXPECT_EQ(absl::string_view(kData, kHalfDataSize),
+ absl::string_view(actual, bytes_sent));
+
+ // Verify that the input file offset has been updated
+ ASSERT_THAT(read(inf.get(), &actual, kDataSize - bytes_sent),
+ SyscallSucceedsWithValue(kHalfDataSize));
+ EXPECT_EQ(
+ absl::string_view(kData + kDataSize - bytes_sent, kDataSize - bytes_sent),
+ absl::string_view(actual, kHalfDataSize));
+}
+
+TEST(SendFileTest, SendAndUpdateFileOffsetFromNonzeroStartingPoint) {
+ // Create temp files.
+ // Test input string length must be > 2 AND divisible by 4.
+ constexpr char kData[] = "The slings and arrows of outrageous fortune,";
+ constexpr int kDataSize = sizeof(kData) - 1;
+ constexpr int kHalfDataSize = kDataSize / 2;
+ constexpr int kQuarterDataSize = kHalfDataSize / 2;
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Open the output file as write only.
+ FileDescriptor outf;
+ outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
+
+ // Read a quarter of the data from the infile which should update the file
+ // offset, we don't actually care about the data so it goes into the garbage.
+ char garbage[kQuarterDataSize];
+ ASSERT_THAT(read(inf.get(), &garbage, kQuarterDataSize),
+ SyscallSucceedsWithValue(kQuarterDataSize));
+
+ // Send data and verify that sendfile returns the correct value.
+ int bytes_sent;
+ EXPECT_THAT(
+ bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kHalfDataSize),
+ SyscallSucceedsWithValue(kHalfDataSize));
+
+ // Close out_fd to avoid leak.
+ outf.reset();
+
+ // Open the output file as read only.
+ outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
+
+ // Verify that the output file has the correct data.
+ char actual[kHalfDataSize];
+ ASSERT_THAT(read(outf.get(), &actual, bytes_sent),
+ SyscallSucceedsWithValue(kHalfDataSize));
+ EXPECT_EQ(absl::string_view(kData + kQuarterDataSize, kHalfDataSize),
+ absl::string_view(actual, bytes_sent));
+
+ // Verify that the input file offset has been updated
+ ASSERT_THAT(read(inf.get(), &actual, kQuarterDataSize),
+ SyscallSucceedsWithValue(kQuarterDataSize));
+
+ EXPECT_EQ(
+ absl::string_view(kData + kDataSize - kQuarterDataSize, kQuarterDataSize),
+ absl::string_view(actual, kQuarterDataSize));
+}
+
+TEST(SendFileTest, SendAndUpdateGivenOffset) {
+ // Create temp files.
+ // Test input string length must be >= 4 AND divisible by 4.
+ constexpr char kData[] = "Or to take Arms against a Sea of troubles,";
+ constexpr int kDataSize = sizeof(kData) + 1;
+ constexpr int kHalfDataSize = kDataSize / 2;
+ constexpr int kQuarterDataSize = kHalfDataSize / 2;
+ constexpr int kThreeFourthsDataSize = 3 * kDataSize / 4;
+
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Open the output file as write only.
+ FileDescriptor outf;
+ outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
+
+ // Create offset for sending.
+ off_t offset = kQuarterDataSize;
+
+ // Send data and verify that sendfile returns the correct value.
+ int bytes_sent;
+ EXPECT_THAT(
+ bytes_sent = sendfile(outf.get(), inf.get(), &offset, kHalfDataSize),
+ SyscallSucceedsWithValue(kHalfDataSize));
+
+ // Close out_fd to avoid leak.
+ outf.reset();
+
+ // Open the output file as read only.
+ outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
+
+ // Verify that the output file has the correct data.
+ char actual[kHalfDataSize];
+ ASSERT_THAT(read(outf.get(), &actual, bytes_sent),
+ SyscallSucceedsWithValue(kHalfDataSize));
+ EXPECT_EQ(absl::string_view(kData + kQuarterDataSize, kHalfDataSize),
+ absl::string_view(actual, bytes_sent));
+
+ // Verify that the input file offset has NOT been updated.
+ ASSERT_THAT(read(inf.get(), &actual, kHalfDataSize),
+ SyscallSucceedsWithValue(kHalfDataSize));
+ EXPECT_EQ(absl::string_view(kData, kHalfDataSize),
+ absl::string_view(actual, kHalfDataSize));
+
+ // Verify that the offset pointer has been updated.
+ EXPECT_EQ(offset, kThreeFourthsDataSize);
+}
+
+TEST(SendFileTest, DoNotSendfileIfOutfileIsAppendOnly) {
+ // Create temp files.
+ constexpr char kData[] = "And by opposing end them: to die, to sleep";
+ constexpr int kDataSize = sizeof(kData) - 1;
+
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Open the output file as append only.
+ const FileDescriptor outf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY | O_APPEND));
+
+ // Send data and verify that sendfile returns the correct errno.
+ EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, kDataSize),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SendFileTest, AppendCheckOrdering) {
+ constexpr char kData[] = "And by opposing end them: to die, to sleep";
+ constexpr int kDataSize = sizeof(kData) - 1;
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+
+ const FileDescriptor read =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+ const FileDescriptor write =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY));
+ const FileDescriptor append =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_APPEND));
+
+ // Check that read/write file mode is verified before append.
+ EXPECT_THAT(sendfile(append.get(), read.get(), nullptr, kDataSize),
+ SyscallFailsWithErrno(EBADF));
+ EXPECT_THAT(sendfile(write.get(), write.get(), nullptr, kDataSize),
+ SyscallFailsWithErrno(EBADF));
+}
+
+TEST(SendFileTest, DoNotSendfileIfOutfileIsNotWritable) {
+ // Create temp files.
+ constexpr char kData[] = "No more; and by a sleep, to say we end";
+ constexpr int kDataSize = sizeof(kData) - 1;
+
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Open the output file as read only.
+ const FileDescriptor outf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
+
+ // Send data and verify that sendfile returns the correct errno.
+ EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, kDataSize),
+ SyscallFailsWithErrno(EBADF));
+}
+
+TEST(SendFileTest, DoNotSendfileIfInfileIsNotReadable) {
+ // Create temp files.
+ constexpr char kData[] = "the heart-ache, and the thousand natural shocks";
+ constexpr int kDataSize = sizeof(kData) - 1;
+
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as write only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_WRONLY));
+
+ // Open the output file as write only.
+ const FileDescriptor outf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
+
+ // Send data and verify that sendfile returns the correct errno.
+ EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, kDataSize),
+ SyscallFailsWithErrno(EBADF));
+}
+
+TEST(SendFileTest, DoNotSendANegativeNumberOfBytes) {
+ // Create temp files.
+ constexpr char kData[] = "that Flesh is heir to? 'Tis a consummation";
+
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Open the output file as write only.
+ const FileDescriptor outf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
+
+ // Send data and verify that sendfile returns the correct errno.
+ EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, -1),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SendFileTest, SendTheCorrectNumberOfBytesEvenIfWeTryToSendTooManyBytes) {
+ // Create temp files.
+ constexpr char kData[] = "devoutly to be wished. To die, to sleep,";
+ constexpr int kDataSize = sizeof(kData) - 1;
+
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Open the output file as write only.
+ FileDescriptor outf;
+ outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
+
+ // Send data and verify that sendfile returns the correct value.
+ int bytes_sent;
+ EXPECT_THAT(
+ bytes_sent = sendfile(outf.get(), inf.get(), nullptr, kDataSize + 100),
+ SyscallSucceedsWithValue(kDataSize));
+
+ // Close outf to avoid leak.
+ outf.reset();
+
+ // Open the output file as read only.
+ outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
+
+ // Verify that the output file has the correct data.
+ char actual[kDataSize];
+ ASSERT_THAT(read(outf.get(), &actual, bytes_sent),
+ SyscallSucceedsWithValue(kDataSize));
+ EXPECT_EQ(kData, absl::string_view(actual, bytes_sent));
+}
+
+TEST(SendFileTest, SendToNotARegularFile) {
+ // Make temp input directory and open as read only.
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY));
+
+ // Make temp output file and open as write only.
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor outf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
+
+ // Receive an error since a directory is not a regular file.
+ EXPECT_THAT(sendfile(outf.get(), inf.get(), nullptr, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SendFileTest, SendPipeWouldBlock) {
+ // Create temp file.
+ constexpr char kData[] =
+ "The fool doth think he is wise, but the wise man knows himself to be a "
+ "fool.";
+ constexpr int kDataSize = sizeof(kData) - 1;
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Setup the output named pipe.
+ int fds[2];
+ ASSERT_THAT(pipe2(fds, O_NONBLOCK), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Fill up the pipe's buffer.
+ int pipe_size = -1;
+ ASSERT_THAT(pipe_size = fcntl(wfd.get(), F_GETPIPE_SZ), SyscallSucceeds());
+ std::vector<char> buf(2 * pipe_size);
+ ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(pipe_size));
+
+ EXPECT_THAT(sendfile(wfd.get(), inf.get(), nullptr, kDataSize),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST(SendFileTest, SendPipeBlocks) {
+ // Create temp file.
+ constexpr char kData[] =
+ "The fault, dear Brutus, is not in our stars, but in ourselves.";
+ constexpr int kDataSize = sizeof(kData) - 1;
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Setup the output named pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Fill up the pipe's buffer.
+ int pipe_size = -1;
+ ASSERT_THAT(pipe_size = fcntl(wfd.get(), F_GETPIPE_SZ), SyscallSucceeds());
+ std::vector<char> buf(pipe_size);
+ ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(pipe_size));
+
+ ScopedThread t([&]() {
+ absl::SleepFor(absl::Milliseconds(100));
+ ASSERT_THAT(read(rfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(pipe_size));
+ });
+
+ EXPECT_THAT(sendfile(wfd.get(), inf.get(), nullptr, kDataSize),
+ SyscallSucceedsWithValue(kDataSize));
+}
+
+TEST(SendFileTest, SendToSpecialFile) {
+ // Create temp file.
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode));
+
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+ constexpr int kSize = 0x7ff;
+ ASSERT_THAT(ftruncate(inf.get(), kSize), SyscallSucceeds());
+
+ auto eventfd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD());
+
+ // eventfd can accept a number of bytes which is a multiple of 8.
+ EXPECT_THAT(sendfile(eventfd.get(), inf.get(), nullptr, 0xfffff),
+ SyscallSucceedsWithValue(kSize & (~7)));
+}
+
+TEST(SendFileTest, SendFileToPipe) {
+ // Create temp file.
+ constexpr char kData[] = "<insert-quote-here>";
+ constexpr int kDataSize = sizeof(kData) - 1;
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), kData, TempPath::kDefaultFileMode));
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Create a pipe for sending to a pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Expect to read up to the given size.
+ std::vector<char> buf(kDataSize);
+ ScopedThread t([&]() {
+ absl::SleepFor(absl::Milliseconds(100));
+ ASSERT_THAT(read(rfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kDataSize));
+ });
+
+ // Send with twice the size of the file, which should hit EOF.
+ EXPECT_THAT(sendfile(wfd.get(), inf.get(), nullptr, kDataSize * 2),
+ SyscallSucceedsWithValue(kDataSize));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
new file mode 100644
index 000000000..c101fe9d2
--- /dev/null
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -0,0 +1,231 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/sendfile.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+class SendFileTest : public ::testing::TestWithParam<int> {
+ protected:
+ PosixErrorOr<std::unique_ptr<SocketPair>> Sockets(int type) {
+ // Bind a server socket.
+ int family = GetParam();
+ switch (family) {
+ case AF_INET: {
+ if (type == SOCK_STREAM) {
+ return SocketPairKind{
+ "TCP", AF_INET, type, 0,
+ TCPAcceptBindSocketPairCreator(AF_INET, type, 0, false)}
+ .Create();
+ } else {
+ return SocketPairKind{
+ "UDP", AF_INET, type, 0,
+ UDPBidirectionalBindSocketPairCreator(AF_INET, type, 0, false)}
+ .Create();
+ }
+ }
+ case AF_UNIX: {
+ if (type == SOCK_STREAM) {
+ return SocketPairKind{
+ "UNIX", AF_UNIX, type, 0,
+ FilesystemAcceptBindSocketPairCreator(AF_UNIX, type, 0)}
+ .Create();
+ } else {
+ return SocketPairKind{
+ "UNIX", AF_UNIX, type, 0,
+ FilesystemBidirectionalBindSocketPairCreator(AF_UNIX, type, 0)}
+ .Create();
+ }
+ }
+ default:
+ return PosixError(EINVAL);
+ }
+ }
+};
+
+// Sends large file to exercise the path that read and writes data multiple
+// times, esp. when more data is read than can be written.
+TEST_P(SendFileTest, SendMultiple) {
+ std::vector<char> data(5 * 1024 * 1024);
+ RandomizeBuffer(data.data(), data.size());
+
+ // Create temp files.
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::string_view(data.data(), data.size()),
+ TempPath::kDefaultFileMode));
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Create sockets.
+ auto socks = ASSERT_NO_ERRNO_AND_VALUE(Sockets(SOCK_STREAM));
+
+ // Thread that reads data from socket and dumps to a file.
+ ScopedThread th([&] {
+ FileDescriptor outf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
+
+ // Read until socket is closed.
+ char buf[10240];
+ for (int cnt = 0;; cnt++) {
+ int r = RetryEINTR(read)(socks->first_fd(), buf, sizeof(buf));
+ // We cannot afford to save on every read() call.
+ if (cnt % 1000 == 0) {
+ ASSERT_THAT(r, SyscallSucceeds());
+ } else {
+ const DisableSave ds;
+ ASSERT_THAT(r, SyscallSucceeds());
+ }
+ if (r == 0) {
+ // EOF
+ break;
+ }
+ int w = RetryEINTR(write)(outf.get(), buf, r);
+ // We cannot afford to save on every write() call.
+ if (cnt % 1010 == 0) {
+ ASSERT_THAT(w, SyscallSucceedsWithValue(r));
+ } else {
+ const DisableSave ds;
+ ASSERT_THAT(w, SyscallSucceedsWithValue(r));
+ }
+ }
+ });
+
+ // Open the input file as read only.
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ int cnt = 0;
+ for (size_t sent = 0; sent < data.size(); cnt++) {
+ const size_t remain = data.size() - sent;
+ std::cout << "sendfile, size=" << data.size() << ", sent=" << sent
+ << ", remain=" << remain << std::endl;
+
+ // Send data and verify that sendfile returns the correct value.
+ int res = sendfile(socks->second_fd(), inf.get(), nullptr, remain);
+ // We cannot afford to save on every sendfile() call.
+ if (cnt % 120 == 0) {
+ MaybeSave();
+ }
+ if (res == 0) {
+ // EOF
+ break;
+ }
+ if (res > 0) {
+ sent += res;
+ } else {
+ ASSERT_TRUE(errno == EINTR || errno == EAGAIN) << "errno=" << errno;
+ }
+ }
+
+ // Close socket to stop thread.
+ close(socks->release_second_fd());
+ th.Join();
+
+ // Verify that the output file has the correct data.
+ const FileDescriptor outf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
+ std::vector<char> actual(data.size(), '\0');
+ ASSERT_THAT(RetryEINTR(read)(outf.get(), actual.data(), actual.size()),
+ SyscallSucceedsWithValue(actual.size()));
+ ASSERT_EQ(memcmp(data.data(), actual.data(), data.size()), 0);
+}
+
+TEST_P(SendFileTest, Shutdown) {
+ // Create a socket.
+ auto socks = ASSERT_NO_ERRNO_AND_VALUE(Sockets(SOCK_STREAM));
+
+ // If this is a TCP socket, then turn off linger.
+ if (GetParam() == AF_INET) {
+ struct linger sl;
+ sl.l_onoff = 1;
+ sl.l_linger = 0;
+ ASSERT_THAT(
+ setsockopt(socks->first_fd(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+ SyscallSucceeds());
+ }
+
+ // Create a 1m file with random data.
+ std::vector<char> data(1024 * 1024);
+ RandomizeBuffer(data.data(), data.size());
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::string_view(data.data(), data.size()),
+ TempPath::kDefaultFileMode));
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Read some data, then shutdown the socket. We don't actually care about
+ // checking the contents (other tests do that), so we just re-use the same
+ // buffer as above.
+ ScopedThread t([&]() {
+ size_t done = 0;
+ while (done < data.size()) {
+ int n = RetryEINTR(read)(socks->first_fd(), data.data(), data.size());
+ ASSERT_THAT(n, SyscallSucceeds());
+ done += n;
+ }
+ // Close the server side socket.
+ close(socks->release_first_fd());
+ });
+
+ // Continuously stream from the file to the socket. Note we do not assert
+ // that a specific amount of data has been written at any time, just that some
+ // data is written. Eventually, we should get a connection reset error.
+ while (1) {
+ off_t offset = 0; // Always read from the start.
+ int n = sendfile(socks->second_fd(), inf.get(), &offset, data.size());
+ EXPECT_THAT(n, AnyOf(SyscallFailsWithErrno(ECONNRESET),
+ SyscallFailsWithErrno(EPIPE), SyscallSucceeds()));
+ if (n <= 0) {
+ break;
+ }
+ }
+}
+
+TEST_P(SendFileTest, SendpageFromEmptyFileToUDP) {
+ auto socks = ASSERT_NO_ERRNO_AND_VALUE(Sockets(SOCK_DGRAM));
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+
+ // The value to the count argument has to be so that it is impossible to
+ // allocate a buffer of this size. In Linux, sendfile transfer at most
+ // 0x7ffff000 (MAX_RW_COUNT) bytes.
+ EXPECT_THAT(sendfile(socks->first_fd(), fd.get(), 0x0, 0x8000000000004),
+ SyscallSucceedsWithValue(0));
+}
+
+INSTANTIATE_TEST_SUITE_P(AddressFamily, SendFileTest,
+ ::testing::Values(AF_UNIX, AF_INET));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/shm.cc b/test/syscalls/linux/shm.cc
new file mode 100644
index 000000000..c7fdbb924
--- /dev/null
+++ b/test/syscalls/linux/shm.cc
@@ -0,0 +1,508 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/mman.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+
+#include "absl/time/clock.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+using ::testing::_;
+
+const uint64_t kAllocSize = kPageSize * 128ULL;
+
+PosixErrorOr<char*> Shmat(int shmid, const void* shmaddr, int shmflg) {
+ const intptr_t addr =
+ reinterpret_cast<intptr_t>(shmat(shmid, shmaddr, shmflg));
+ if (addr == -1) {
+ return PosixError(errno, "shmat() failed");
+ }
+ return reinterpret_cast<char*>(addr);
+}
+
+PosixError Shmdt(const char* shmaddr) {
+ const int ret = shmdt(shmaddr);
+ if (ret == -1) {
+ return PosixError(errno, "shmdt() failed");
+ }
+ return NoError();
+}
+
+template <typename T>
+PosixErrorOr<int> Shmctl(int shmid, int cmd, T* buf) {
+ int ret = shmctl(shmid, cmd, reinterpret_cast<struct shmid_ds*>(buf));
+ if (ret == -1) {
+ return PosixError(errno, "shmctl() failed");
+ }
+ return ret;
+}
+
+// ShmSegment is a RAII object for automatically cleaning up shm segments.
+class ShmSegment {
+ public:
+ explicit ShmSegment(int id) : id_(id) {}
+
+ ~ShmSegment() {
+ if (id_ >= 0) {
+ EXPECT_NO_ERRNO(Rmid());
+ id_ = -1;
+ }
+ }
+
+ ShmSegment(ShmSegment&& other) : id_(other.release()) {}
+
+ ShmSegment& operator=(ShmSegment&& other) {
+ id_ = other.release();
+ return *this;
+ }
+
+ ShmSegment(ShmSegment const& other) = delete;
+ ShmSegment& operator=(ShmSegment const& other) = delete;
+
+ int id() const { return id_; }
+
+ int release() {
+ int id = id_;
+ id_ = -1;
+ return id;
+ }
+
+ PosixErrorOr<int> Rmid() {
+ RETURN_IF_ERRNO(Shmctl<void>(id_, IPC_RMID, nullptr));
+ return release();
+ }
+
+ private:
+ int id_ = -1;
+};
+
+PosixErrorOr<int> ShmgetRaw(key_t key, size_t size, int shmflg) {
+ int id = shmget(key, size, shmflg);
+ if (id == -1) {
+ return PosixError(errno, "shmget() failed");
+ }
+ return id;
+}
+
+PosixErrorOr<ShmSegment> Shmget(key_t key, size_t size, int shmflg) {
+ ASSIGN_OR_RETURN_ERRNO(int id, ShmgetRaw(key, size, shmflg));
+ return ShmSegment(id);
+}
+
+TEST(ShmTest, AttachDetach) {
+ const ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ struct shmid_ds attr;
+ ASSERT_NO_ERRNO(Shmctl(shm.id(), IPC_STAT, &attr));
+ EXPECT_EQ(attr.shm_segsz, kAllocSize);
+ EXPECT_EQ(attr.shm_nattch, 0);
+
+ const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+ ASSERT_NO_ERRNO(Shmctl(shm.id(), IPC_STAT, &attr));
+ EXPECT_EQ(attr.shm_nattch, 1);
+
+ const char* addr2 = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+ ASSERT_NO_ERRNO(Shmctl(shm.id(), IPC_STAT, &attr));
+ EXPECT_EQ(attr.shm_nattch, 2);
+
+ ASSERT_NO_ERRNO(Shmdt(addr));
+ ASSERT_NO_ERRNO(Shmctl(shm.id(), IPC_STAT, &attr));
+ EXPECT_EQ(attr.shm_nattch, 1);
+
+ ASSERT_NO_ERRNO(Shmdt(addr2));
+ ASSERT_NO_ERRNO(Shmctl(shm.id(), IPC_STAT, &attr));
+ EXPECT_EQ(attr.shm_nattch, 0);
+}
+
+TEST(ShmTest, LookupByKey) {
+ const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const key_t key = ftok(keyfile.path().c_str(), 1);
+ const ShmSegment shm =
+ ASSERT_NO_ERRNO_AND_VALUE(Shmget(key, kAllocSize, IPC_CREAT | 0777));
+ const int id2 = ASSERT_NO_ERRNO_AND_VALUE(ShmgetRaw(key, kAllocSize, 0777));
+ EXPECT_EQ(shm.id(), id2);
+}
+
+TEST(ShmTest, DetachedSegmentsPersist) {
+ const ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+ addr[0] = 'x';
+ ASSERT_NO_ERRNO(Shmdt(addr));
+
+ // We should be able to re-attach to the same segment and get our data back.
+ addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+ EXPECT_EQ(addr[0], 'x');
+ ASSERT_NO_ERRNO(Shmdt(addr));
+}
+
+TEST(ShmTest, MultipleDetachFails) {
+ const ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+ ASSERT_NO_ERRNO(Shmdt(addr));
+ EXPECT_THAT(Shmdt(addr), PosixErrorIs(EINVAL, _));
+}
+
+TEST(ShmTest, IpcStat) {
+ const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const key_t key = ftok(keyfile.path().c_str(), 1);
+
+ const time_t start = time(nullptr);
+
+ const ShmSegment shm =
+ ASSERT_NO_ERRNO_AND_VALUE(Shmget(key, kAllocSize, IPC_CREAT | 0777));
+
+ const uid_t uid = getuid();
+ const gid_t gid = getgid();
+ const pid_t pid = getpid();
+
+ struct shmid_ds attr;
+ ASSERT_NO_ERRNO(Shmctl(shm.id(), IPC_STAT, &attr));
+
+ EXPECT_EQ(attr.shm_perm.__key, key);
+ EXPECT_EQ(attr.shm_perm.uid, uid);
+ EXPECT_EQ(attr.shm_perm.gid, gid);
+ EXPECT_EQ(attr.shm_perm.cuid, uid);
+ EXPECT_EQ(attr.shm_perm.cgid, gid);
+ EXPECT_EQ(attr.shm_perm.mode, 0777);
+
+ EXPECT_EQ(attr.shm_segsz, kAllocSize);
+
+ EXPECT_EQ(attr.shm_atime, 0);
+ EXPECT_EQ(attr.shm_dtime, 0);
+
+ // Change time is set on creation.
+ EXPECT_GE(attr.shm_ctime, start);
+
+ EXPECT_EQ(attr.shm_cpid, pid);
+ EXPECT_EQ(attr.shm_lpid, 0);
+
+ EXPECT_EQ(attr.shm_nattch, 0);
+
+ // The timestamps only have a resolution of seconds; slow down so we actually
+ // see the timestamps change.
+ absl::SleepFor(absl::Seconds(1));
+ const time_t pre_attach = time(nullptr);
+
+ const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+ ASSERT_NO_ERRNO(Shmctl(shm.id(), IPC_STAT, &attr));
+
+ EXPECT_GE(attr.shm_atime, pre_attach);
+ EXPECT_EQ(attr.shm_dtime, 0);
+ EXPECT_LT(attr.shm_ctime, pre_attach);
+ EXPECT_EQ(attr.shm_lpid, pid);
+ EXPECT_EQ(attr.shm_nattch, 1);
+
+ absl::SleepFor(absl::Seconds(1));
+ const time_t pre_detach = time(nullptr);
+
+ ASSERT_NO_ERRNO(Shmdt(addr));
+ ASSERT_NO_ERRNO(Shmctl(shm.id(), IPC_STAT, &attr));
+
+ EXPECT_LT(attr.shm_atime, pre_detach);
+ EXPECT_GE(attr.shm_dtime, pre_detach);
+ EXPECT_LT(attr.shm_ctime, pre_detach);
+ EXPECT_EQ(attr.shm_lpid, pid);
+ EXPECT_EQ(attr.shm_nattch, 0);
+}
+
+TEST(ShmTest, ShmStat) {
+ // This test relies on the segment we create to be the first one on the
+ // system, causing it to occupy slot 1. We can't reasonably expect this on a
+ // general Linux host.
+ SKIP_IF(!IsRunningOnGvisor());
+
+ const ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ struct shmid_ds attr;
+ ASSERT_NO_ERRNO(Shmctl(1, SHM_STAT, &attr));
+ // This does the same thing as IPC_STAT, so only test that the syscall
+ // succeeds here.
+}
+
+TEST(ShmTest, IpcInfo) {
+ struct shminfo info;
+ ASSERT_NO_ERRNO(Shmctl(0, IPC_INFO, &info));
+
+ EXPECT_EQ(info.shmmin, 1); // This is always 1, according to the man page.
+ EXPECT_GT(info.shmmax, info.shmmin);
+ EXPECT_GT(info.shmmni, 0);
+ EXPECT_GT(info.shmseg, 0);
+ EXPECT_GT(info.shmall, 0);
+}
+
+TEST(ShmTest, ShmInfo) {
+ struct shm_info info;
+
+ // We generally can't know what other processes on a linux machine
+ // does with shared memory segments, so we can't test specific
+ // numbers on Linux. When running under gvisor, we're guaranteed to
+ // be the only ones using shm, so we can easily verify machine-wide
+ // numbers.
+ if (IsRunningOnGvisor()) {
+ ASSERT_NO_ERRNO(Shmctl(0, SHM_INFO, &info));
+ EXPECT_EQ(info.used_ids, 0);
+ EXPECT_EQ(info.shm_tot, 0);
+ EXPECT_EQ(info.shm_rss, 0);
+ EXPECT_EQ(info.shm_swp, 0);
+ }
+
+ const ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+
+ ASSERT_NO_ERRNO(Shmctl(1, SHM_INFO, &info));
+
+ if (IsRunningOnGvisor()) {
+ ASSERT_NO_ERRNO(Shmctl(shm.id(), SHM_INFO, &info));
+ EXPECT_EQ(info.used_ids, 1);
+ EXPECT_EQ(info.shm_tot, kAllocSize / kPageSize);
+ EXPECT_EQ(info.shm_rss, kAllocSize / kPageSize);
+ EXPECT_EQ(info.shm_swp, 0); // Gvisor currently never swaps.
+ }
+
+ ASSERT_NO_ERRNO(Shmdt(addr));
+}
+
+TEST(ShmTest, ShmCtlSet) {
+ const ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+
+ struct shmid_ds attr;
+ ASSERT_NO_ERRNO(Shmctl(shm.id(), IPC_STAT, &attr));
+ ASSERT_EQ(attr.shm_perm.mode, 0777);
+
+ attr.shm_perm.mode = 0766;
+ ASSERT_NO_ERRNO(Shmctl(shm.id(), IPC_SET, &attr));
+
+ ASSERT_NO_ERRNO(Shmctl(shm.id(), IPC_STAT, &attr));
+ ASSERT_EQ(attr.shm_perm.mode, 0766);
+
+ ASSERT_NO_ERRNO(Shmdt(addr));
+}
+
+TEST(ShmTest, RemovedSegmentsAreMarkedDeleted) {
+ ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+ const int id = ASSERT_NO_ERRNO_AND_VALUE(shm.Rmid());
+ struct shmid_ds attr;
+ ASSERT_NO_ERRNO(Shmctl(id, IPC_STAT, &attr));
+ EXPECT_NE(attr.shm_perm.mode & SHM_DEST, 0);
+ ASSERT_NO_ERRNO(Shmdt(addr));
+}
+
+TEST(ShmTest, RemovedSegmentsAreDestroyed) {
+ ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+
+ const uint64_t alloc_pages = kAllocSize / kPageSize;
+
+ struct shm_info info;
+ ASSERT_NO_ERRNO(Shmctl(0 /*ignored*/, SHM_INFO, &info));
+ const uint64_t before = info.shm_tot;
+
+ ASSERT_NO_ERRNO(shm.Rmid());
+ ASSERT_NO_ERRNO(Shmdt(addr));
+
+ ASSERT_NO_ERRNO(Shmctl(0 /*ignored*/, SHM_INFO, &info));
+ if (IsRunningOnGvisor()) {
+ // No guarantees on system-wide shm memory usage on a generic linux host.
+ const uint64_t after = info.shm_tot;
+ EXPECT_EQ(after, before - alloc_pages);
+ }
+}
+
+TEST(ShmTest, AllowsAttachToRemovedSegmentWithRefs) {
+ ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ const char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+ const int id = ASSERT_NO_ERRNO_AND_VALUE(shm.Rmid());
+ const char* addr2 = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id, nullptr, 0));
+ ASSERT_NO_ERRNO(Shmdt(addr));
+ ASSERT_NO_ERRNO(Shmdt(addr2));
+}
+
+TEST(ShmTest, RemovedSegmentsAreNotDiscoverable) {
+ const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const key_t key = ftok(keyfile.path().c_str(), 1);
+ ShmSegment shm =
+ ASSERT_NO_ERRNO_AND_VALUE(Shmget(key, kAllocSize, IPC_CREAT | 0777));
+ ASSERT_NO_ERRNO(shm.Rmid());
+ EXPECT_THAT(Shmget(key, kAllocSize, 0777), PosixErrorIs(ENOENT, _));
+}
+
+TEST(ShmDeathTest, ReadonlySegment) {
+ SetupGvisorDeathTest();
+ const ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, SHM_RDONLY));
+ // Reading succeeds.
+ static_cast<void>(addr[0]);
+ // Writing fails.
+ EXPECT_EXIT(addr[0] = 'x', ::testing::KilledBySignal(SIGSEGV), "");
+}
+
+TEST(ShmDeathTest, SegmentNotAccessibleAfterDetach) {
+ // This test is susceptible to races with concurrent mmaps running in parallel
+ // gtest threads since the test relies on the address freed during a shm
+ // segment destruction to remain unused. We run the test body in a forked
+ // child to guarantee a single-threaded context to avoid this.
+
+ SetupGvisorDeathTest();
+
+ const auto rest = [&] {
+ ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+
+ // Mark the segment as destroyed so it's automatically cleaned up when we
+ // crash below. We can't rely on the standard cleanup since the destructor
+ // will not run after the SIGSEGV. Note that this doesn't destroy the
+ // segment immediately since we're still attached to it.
+ ASSERT_NO_ERRNO(shm.Rmid());
+
+ addr[0] = 'x';
+ ASSERT_NO_ERRNO(Shmdt(addr));
+
+ // This access should cause a SIGSEGV.
+ addr[0] = 'x';
+ };
+
+ EXPECT_THAT(InForkedProcess(rest),
+ IsPosixErrorOkAndHolds(W_EXITCODE(0, SIGSEGV)));
+}
+
+TEST(ShmTest, RequestingSegmentSmallerThanSHMMINFails) {
+ struct shminfo info;
+ ASSERT_NO_ERRNO(Shmctl(0, IPC_INFO, &info));
+ const uint64_t size = info.shmmin - 1;
+ EXPECT_THAT(Shmget(IPC_PRIVATE, size, IPC_CREAT | 0777),
+ PosixErrorIs(EINVAL, _));
+}
+
+TEST(ShmTest, RequestingSegmentLargerThanSHMMAXFails) {
+ struct shminfo info;
+ ASSERT_NO_ERRNO(Shmctl(0, IPC_INFO, &info));
+ const uint64_t size = info.shmmax + kPageSize;
+ EXPECT_THAT(Shmget(IPC_PRIVATE, size, IPC_CREAT | 0777),
+ PosixErrorIs(EINVAL, _));
+}
+
+TEST(ShmTest, RequestingUnalignedSizeSucceeds) {
+ EXPECT_NO_ERRNO(Shmget(IPC_PRIVATE, 4097, IPC_CREAT | 0777));
+}
+
+TEST(ShmTest, RequestingDuplicateCreationFails) {
+ const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const key_t key = ftok(keyfile.path().c_str(), 1);
+ const ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(key, kAllocSize, IPC_CREAT | IPC_EXCL | 0777));
+ EXPECT_THAT(Shmget(key, kAllocSize, IPC_CREAT | IPC_EXCL | 0777),
+ PosixErrorIs(EEXIST, _));
+}
+
+TEST(ShmTest, NonExistentSegmentsAreNotFound) {
+ const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const key_t key = ftok(keyfile.path().c_str(), 1);
+ // Do not request creation.
+ EXPECT_THAT(Shmget(key, kAllocSize, 0777), PosixErrorIs(ENOENT, _));
+}
+
+TEST(ShmTest, SegmentsSizeFixedOnCreation) {
+ const TempPath keyfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const key_t key = ftok(keyfile.path().c_str(), 1);
+
+ // Base segment.
+ const ShmSegment shm =
+ ASSERT_NO_ERRNO_AND_VALUE(Shmget(key, kAllocSize, IPC_CREAT | 0777));
+
+ // Ask for the same segment at half size. This succeeds.
+ const int id2 =
+ ASSERT_NO_ERRNO_AND_VALUE(ShmgetRaw(key, kAllocSize / 2, 0777));
+
+ // Ask for the same segment at double size.
+ EXPECT_THAT(Shmget(key, kAllocSize * 2, 0777), PosixErrorIs(EINVAL, _));
+
+ char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+ char* addr2 = ASSERT_NO_ERRNO_AND_VALUE(Shmat(id2, nullptr, 0));
+
+ // We have 2 different maps...
+ EXPECT_NE(addr, addr2);
+
+ // ... And both maps are kAllocSize bytes; despite asking for a half-sized
+ // segment for the second map.
+ addr[kAllocSize - 1] = 'x';
+ addr2[kAllocSize - 1] = 'x';
+
+ ASSERT_NO_ERRNO(Shmdt(addr));
+ ASSERT_NO_ERRNO(Shmdt(addr2));
+}
+
+TEST(ShmTest, PartialUnmap) {
+ const ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+ EXPECT_THAT(munmap(addr + (kAllocSize / 4), kAllocSize / 2),
+ SyscallSucceeds());
+ ASSERT_NO_ERRNO(Shmdt(addr));
+}
+
+// Check that sentry does not panic when asked for a zero-length private shm
+// segment. Regression test for b/110694797.
+TEST(ShmTest, GracefullyFailOnZeroLenSegmentCreation) {
+ EXPECT_THAT(Shmget(IPC_PRIVATE, 0, 0), PosixErrorIs(EINVAL, _));
+}
+
+TEST(ShmTest, NoDestructionOfAttachedSegmentWithMultipleRmid) {
+ ShmSegment shm = ASSERT_NO_ERRNO_AND_VALUE(
+ Shmget(IPC_PRIVATE, kAllocSize, IPC_CREAT | 0777));
+ char* addr = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+ char* addr2 = ASSERT_NO_ERRNO_AND_VALUE(Shmat(shm.id(), nullptr, 0));
+
+ // There should be 2 refs to the segment from the 2 attachments, and a single
+ // self-reference. Mark the segment as destroyed more than 3 times through
+ // shmctl(RMID). If there's a bug with the ref counting, this should cause the
+ // count to drop to zero.
+ int id = shm.release();
+ for (int i = 0; i < 6; ++i) {
+ ASSERT_NO_ERRNO(Shmctl<void>(id, IPC_RMID, nullptr));
+ }
+
+ // Segment should remain accessible.
+ addr[0] = 'x';
+ ASSERT_NO_ERRNO(Shmdt(addr));
+
+ // Segment should remain accessible even after one of the two attachments are
+ // detached.
+ addr2[0] = 'x';
+ ASSERT_NO_ERRNO(Shmdt(addr2));
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/sigaction.cc b/test/syscalls/linux/sigaction.cc
new file mode 100644
index 000000000..9d9dd57a8
--- /dev/null
+++ b/test/syscalls/linux/sigaction.cc
@@ -0,0 +1,79 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <sys/syscall.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(SigactionTest, GetLessThanOrEqualToZeroFails) {
+ struct sigaction act = {};
+ ASSERT_THAT(sigaction(-1, nullptr, &act), SyscallFailsWithErrno(EINVAL));
+ ASSERT_THAT(sigaction(0, nullptr, &act), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SigactionTest, SetLessThanOrEqualToZeroFails) {
+ struct sigaction act = {};
+ ASSERT_THAT(sigaction(0, &act, nullptr), SyscallFailsWithErrno(EINVAL));
+ ASSERT_THAT(sigaction(0, &act, nullptr), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SigactionTest, GetGreaterThanMaxFails) {
+ struct sigaction act = {};
+ ASSERT_THAT(sigaction(SIGRTMAX + 1, nullptr, &act),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SigactionTest, SetGreaterThanMaxFails) {
+ struct sigaction act = {};
+ ASSERT_THAT(sigaction(SIGRTMAX + 1, &act, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SigactionTest, SetSigkillFails) {
+ struct sigaction act = {};
+ ASSERT_THAT(sigaction(SIGKILL, nullptr, &act), SyscallSucceeds());
+ ASSERT_THAT(sigaction(SIGKILL, &act, nullptr), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SigactionTest, SetSigstopFails) {
+ struct sigaction act = {};
+ ASSERT_THAT(sigaction(SIGSTOP, nullptr, &act), SyscallSucceeds());
+ ASSERT_THAT(sigaction(SIGSTOP, &act, nullptr), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SigactionTest, BadSigsetFails) {
+ constexpr size_t kWrongSigSetSize = 43;
+
+ struct sigaction act = {};
+
+ // The syscall itself (rather than the libc wrapper) takes the sigset_t size.
+ ASSERT_THAT(
+ syscall(SYS_rt_sigaction, SIGTERM, nullptr, &act, kWrongSigSetSize),
+ SyscallFailsWithErrno(EINVAL));
+ ASSERT_THAT(
+ syscall(SYS_rt_sigaction, SIGTERM, &act, nullptr, kWrongSigSetSize),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/sigaltstack.cc b/test/syscalls/linux/sigaltstack.cc
new file mode 100644
index 000000000..24e7c4960
--- /dev/null
+++ b/test/syscalls/linux/sigaltstack.cc
@@ -0,0 +1,268 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <functional>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "test/util/cleanup.h"
+#include "test/util/fs_util.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+PosixErrorOr<Cleanup> ScopedSigaltstack(stack_t const& stack) {
+ stack_t old_stack;
+ int rc = sigaltstack(&stack, &old_stack);
+ MaybeSave();
+ if (rc < 0) {
+ return PosixError(errno, "sigaltstack failed");
+ }
+ return Cleanup([old_stack] {
+ EXPECT_THAT(sigaltstack(&old_stack, nullptr), SyscallSucceeds());
+ });
+}
+
+volatile bool got_signal = false;
+volatile int sigaltstack_errno = 0;
+volatile int ss_flags = 0;
+
+void sigaltstack_handler(int sig, siginfo_t* siginfo, void* arg) {
+ got_signal = true;
+
+ stack_t stack;
+ int ret = sigaltstack(nullptr, &stack);
+ MaybeSave();
+ if (ret < 0) {
+ sigaltstack_errno = errno;
+ return;
+ }
+ ss_flags = stack.ss_flags;
+}
+
+TEST(SigaltstackTest, Success) {
+ std::vector<char> stack_mem(SIGSTKSZ);
+ stack_t stack = {};
+ stack.ss_sp = stack_mem.data();
+ stack.ss_size = stack_mem.size();
+ auto const cleanup_sigstack =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaltstack(stack));
+
+ struct sigaction sa = {};
+ sa.sa_sigaction = sigaltstack_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
+ auto const cleanup_sa =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGUSR1, sa));
+
+ // Send signal to this thread, as sigaltstack is per-thread.
+ EXPECT_THAT(tgkill(getpid(), gettid(), SIGUSR1), SyscallSucceeds());
+
+ EXPECT_TRUE(got_signal);
+ EXPECT_EQ(sigaltstack_errno, 0);
+ EXPECT_NE(0, ss_flags & SS_ONSTACK);
+}
+
+TEST(SigaltstackTest, ResetByExecve) {
+ std::vector<char> stack_mem(SIGSTKSZ);
+ stack_t stack = {};
+ stack.ss_sp = stack_mem.data();
+ stack.ss_size = stack_mem.size();
+ auto const cleanup_sigstack =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaltstack(stack));
+
+ std::string full_path = RunfilePath("test/syscalls/linux/sigaltstack_check");
+
+ pid_t child_pid = -1;
+ int execve_errno = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec(full_path, {"sigaltstack_check"}, {}, nullptr, &child_pid,
+ &execve_errno));
+
+ ASSERT_GT(child_pid, 0);
+ ASSERT_EQ(execve_errno, 0);
+
+ int status = 0;
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), 0);
+}
+
+volatile bool badhandler_on_sigaltstack = true; // Set by the handler.
+char* volatile badhandler_low_water_mark = nullptr; // Set by the handler.
+volatile uint8_t badhandler_recursive_faults = 0; // Consumed by the handler.
+
+void badhandler(int sig, siginfo_t* siginfo, void* arg) {
+ char stack_var = 0;
+ char* current_ss = &stack_var;
+
+ stack_t stack;
+ int ret = sigaltstack(nullptr, &stack);
+ if (ret < 0 || (stack.ss_flags & SS_ONSTACK) != SS_ONSTACK) {
+ // We should always be marked as being on the stack. Don't allow this to hit
+ // the bottom if this is ever not true (the main test will fail as a
+ // result, but we still need to unwind the recursive faults).
+ badhandler_on_sigaltstack = false;
+ }
+ if (current_ss < badhandler_low_water_mark) {
+ // Record the low point for the signal stack. We never expected this to be
+ // before stack bottom, but this is asserted in the actual test.
+ badhandler_low_water_mark = current_ss;
+ }
+ if (badhandler_recursive_faults > 0) {
+ badhandler_recursive_faults--;
+ Fault();
+ }
+ FixupFault(reinterpret_cast<ucontext_t*>(arg));
+}
+
+TEST(SigaltstackTest, WalksOffBottom) {
+ // This test marks the upper half of the stack_mem array as the signal stack.
+ // It asserts that when a fault occurs in the handler (already on the signal
+ // stack), we eventually continue to fault our way off the stack. We should
+ // not revert to the top of the signal stack when we fall off the bottom and
+ // the signal stack should remain "in use". When we fall off the signal stack,
+ // we should have an unconditional signal delivered and not start using the
+ // first part of the stack_mem array.
+ std::vector<char> stack_mem(SIGSTKSZ * 2);
+ stack_t stack = {};
+ stack.ss_sp = stack_mem.data() + SIGSTKSZ; // See above: upper half.
+ stack.ss_size = SIGSTKSZ; // Only one half the array.
+ auto const cleanup_sigstack =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaltstack(stack));
+
+ // Setup the handler: this must be for SIGSEGV, and it must allow proper
+ // nesting (no signal mask, no defer) so that we can trigger multiple times.
+ //
+ // When we walk off the bottom of the signal stack and force signal delivery
+ // of a SIGSEGV, the handler will revert to the default behavior (kill).
+ struct sigaction sa = {};
+ sa.sa_sigaction = badhandler;
+ sa.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_NODEFER;
+ auto const cleanup_sa =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGSEGV, sa));
+
+ // Trigger a single fault.
+ badhandler_low_water_mark =
+ static_cast<char*>(stack.ss_sp) + SIGSTKSZ; // Expected top.
+ badhandler_recursive_faults = 0; // Disable refault.
+ Fault();
+ EXPECT_TRUE(badhandler_on_sigaltstack);
+ EXPECT_THAT(sigaltstack(nullptr, &stack), SyscallSucceeds());
+ EXPECT_EQ(stack.ss_flags & SS_ONSTACK, 0);
+ EXPECT_LT(badhandler_low_water_mark,
+ reinterpret_cast<char*>(stack.ss_sp) + 2 * SIGSTKSZ);
+ EXPECT_GT(badhandler_low_water_mark, reinterpret_cast<char*>(stack.ss_sp));
+
+ // Trigger two faults.
+ char* prev_low_water_mark = badhandler_low_water_mark; // Previous top.
+ badhandler_recursive_faults = 1; // One refault.
+ Fault();
+ ASSERT_TRUE(badhandler_on_sigaltstack);
+ EXPECT_THAT(sigaltstack(nullptr, &stack), SyscallSucceeds());
+ EXPECT_EQ(stack.ss_flags & SS_ONSTACK, 0);
+ EXPECT_LT(badhandler_low_water_mark, prev_low_water_mark);
+ EXPECT_GT(badhandler_low_water_mark, reinterpret_cast<char*>(stack.ss_sp));
+
+ // Calculate the stack growth for a fault, and set the recursive faults to
+ // ensure that the signal handler stack required exceeds our marked stack area
+ // by a minimal amount. It should remain in the valid stack_mem area so that
+ // we can test the signal is forced merely by going out of the signal stack
+ // bounds, not by a genuine fault.
+ uintptr_t frame_size =
+ static_cast<uintptr_t>(prev_low_water_mark - badhandler_low_water_mark);
+ badhandler_recursive_faults = (SIGSTKSZ + frame_size) / frame_size;
+ EXPECT_EXIT(Fault(), ::testing::KilledBySignal(SIGSEGV), "");
+}
+
+volatile int setonstack_retval = 0; // Set by the handler.
+volatile int setonstack_errno = 0; // Set by the handler.
+
+void setonstack(int sig, siginfo_t* siginfo, void* arg) {
+ char stack_mem[SIGSTKSZ];
+ stack_t stack = {};
+ stack.ss_sp = &stack_mem[0];
+ stack.ss_size = SIGSTKSZ;
+ setonstack_retval = sigaltstack(&stack, nullptr);
+ setonstack_errno = errno;
+ FixupFault(reinterpret_cast<ucontext_t*>(arg));
+}
+
+TEST(SigaltstackTest, SetWhileOnStack) {
+ // Reserve twice as much stack here, since the handler will allocate a vector
+ // of size SIGTKSZ and attempt to set the sigaltstack to that value.
+ std::vector<char> stack_mem(2 * SIGSTKSZ);
+ stack_t stack = {};
+ stack.ss_sp = stack_mem.data();
+ stack.ss_size = stack_mem.size();
+ auto const cleanup_sigstack =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaltstack(stack));
+
+ // See above.
+ struct sigaction sa = {};
+ sa.sa_sigaction = setonstack;
+ sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
+ auto const cleanup_sa =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGSEGV, sa));
+
+ // Trigger a fault.
+ Fault();
+
+ // The set should have failed.
+ EXPECT_EQ(setonstack_retval, -1);
+ EXPECT_EQ(setonstack_errno, EPERM);
+}
+
+TEST(SigaltstackTest, SetCurrentStack) {
+ // This is executed as an exit test because once the signal stack is set to
+ // the local stack, there's no good way to unwind. We don't want to taint the
+ // test of any other tests that might run within this process.
+ EXPECT_EXIT(
+ {
+ char stack_value = 0;
+ stack_t stack = {};
+ stack.ss_sp = &stack_value - kPageSize; // Lower than current level.
+ stack.ss_size = 2 * kPageSize; // => &stack_value +/- kPageSize.
+ TEST_CHECK(sigaltstack(&stack, nullptr) == 0);
+ TEST_CHECK(sigaltstack(nullptr, &stack) == 0);
+ TEST_CHECK((stack.ss_flags & SS_ONSTACK) != 0);
+
+ // Should not be able to change the stack (even no-op).
+ TEST_CHECK(sigaltstack(&stack, nullptr) == -1 && errno == EPERM);
+
+ // Should not be able to disable the stack.
+ stack.ss_flags = SS_DISABLE;
+ TEST_CHECK(sigaltstack(&stack, nullptr) == -1 && errno == EPERM);
+ exit(0);
+ },
+ ::testing::ExitedWithCode(0), "");
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/sigaltstack_check.cc b/test/syscalls/linux/sigaltstack_check.cc
new file mode 100644
index 000000000..5ac1b661d
--- /dev/null
+++ b/test/syscalls/linux/sigaltstack_check.cc
@@ -0,0 +1,33 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Checks that there is no alternate signal stack by default.
+//
+// Used by a test in sigaltstack.cc.
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "test/util/logging.h"
+
+int main(int /* argc */, char** /* argv */) {
+ stack_t stack;
+ TEST_CHECK(sigaltstack(nullptr, &stack) >= 0);
+ TEST_CHECK(stack.ss_flags == SS_DISABLE);
+ TEST_CHECK(stack.ss_sp == 0);
+ TEST_CHECK(stack.ss_size == 0);
+ return 0;
+}
diff --git a/test/syscalls/linux/sigiret.cc b/test/syscalls/linux/sigiret.cc
new file mode 100644
index 000000000..6227774a4
--- /dev/null
+++ b/test/syscalls/linux/sigiret.cc
@@ -0,0 +1,136 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/ucontext.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/logging.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/timer_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr uint64_t kOrigRcx = 0xdeadbeeffacefeed;
+constexpr uint64_t kOrigR11 = 0xfacefeedbaad1dea;
+
+volatile int gotvtalrm, ready;
+
+void sigvtalrm(int sig, siginfo_t* siginfo, void* _uc) {
+ ucontext_t* uc = reinterpret_cast<ucontext_t*>(_uc);
+
+ // Verify that:
+ // - test is in the busy-wait loop waiting for signal.
+ // - %rcx and %r11 values in mcontext_t match kOrigRcx and kOrigR11.
+ if (ready &&
+ static_cast<uint64_t>(uc->uc_mcontext.gregs[REG_RCX]) == kOrigRcx &&
+ static_cast<uint64_t>(uc->uc_mcontext.gregs[REG_R11]) == kOrigR11) {
+ // Modify the values %rcx and %r11 in the ucontext. These are the
+ // values seen by the application after the signal handler returns.
+ uc->uc_mcontext.gregs[REG_RCX] = ~kOrigRcx;
+ uc->uc_mcontext.gregs[REG_R11] = ~kOrigR11;
+ gotvtalrm = 1;
+ }
+}
+
+TEST(SigIretTest, CheckRcxR11) {
+ // Setup signal handler for SIGVTALRM.
+ struct sigaction sa = {};
+ sigfillset(&sa.sa_mask);
+ sa.sa_sigaction = sigvtalrm;
+ sa.sa_flags = SA_SIGINFO;
+ auto const action_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGVTALRM, sa));
+
+ auto const mask_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGVTALRM));
+
+ // Setup itimer to fire after 500 msecs.
+ struct itimerval itimer = {};
+ itimer.it_value.tv_usec = 500 * 1000; // 500 msecs.
+ auto const timer_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_VIRTUAL, itimer));
+
+ // Initialize %rcx and %r11 and spin until the signal handler returns.
+ uint64_t rcx = kOrigRcx;
+ uint64_t r11 = kOrigR11;
+ asm volatile(
+ "movq %[rcx], %%rcx;" // %rcx = rcx
+ "movq %[r11], %%r11;" // %r11 = r11
+ "movl $1, %[ready];" // ready = 1
+ "1: pause; cmpl $0, %[gotvtalrm]; je 1b;" // while (!gotvtalrm);
+ "movq %%rcx, %[rcx];" // rcx = %rcx
+ "movq %%r11, %[r11];" // r11 = %r11
+ : [ ready ] "=m"(ready), [ rcx ] "+m"(rcx), [ r11 ] "+m"(r11)
+ : [ gotvtalrm ] "m"(gotvtalrm)
+ : "cc", "memory", "rcx", "r11");
+
+ // If sigreturn(2) returns via 'sysret' then %rcx and %r11 will be
+ // clobbered and set to 'ptregs->rip' and 'ptregs->rflags' respectively.
+ //
+ // The following check verifies that %rcx and %r11 were not clobbered
+ // when returning from the signal handler (via sigreturn(2)).
+ EXPECT_EQ(rcx, ~kOrigRcx);
+ EXPECT_EQ(r11, ~kOrigR11);
+}
+
+constexpr uint64_t kNonCanonicalRip = 0xCCCC000000000000;
+
+// Test that a non-canonical signal handler faults as expected.
+TEST(SigIretTest, BadHandler) {
+ struct sigaction sa = {};
+ sa.sa_sigaction =
+ reinterpret_cast<void (*)(int, siginfo_t*, void*)>(kNonCanonicalRip);
+ auto const cleanup = ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGUSR1, sa));
+
+ pid_t pid = fork();
+ if (pid == 0) {
+ // Child, wait for signal.
+ while (1) {
+ pause();
+ }
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+
+ EXPECT_THAT(kill(pid, SIGUSR1), SyscallSucceeds());
+
+ int status;
+ EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV)
+ << "status = " << status;
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ // SigIretTest.CheckRcxR11 depends on delivering SIGVTALRM to the main thread.
+ // Block SIGVTALRM so that any other threads created by TestInit will also
+ // have SIGVTALRM blocked.
+ sigset_t set;
+ sigemptyset(&set);
+ sigaddset(&set, SIGVTALRM);
+ TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
+
+ gvisor::testing::TestInit(&argc, &argv);
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/signalfd.cc b/test/syscalls/linux/signalfd.cc
new file mode 100644
index 000000000..389e5fca2
--- /dev/null
+++ b/test/syscalls/linux/signalfd.cc
@@ -0,0 +1,373 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/signalfd.h>
+#include <unistd.h>
+
+#include <functional>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/synchronization/mutex.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+using ::testing::KilledBySignal;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int kSigno = SIGUSR1;
+constexpr int kSignoMax = 64; // SIGRTMAX
+constexpr int kSignoAlt = SIGUSR2;
+
+// Returns a new signalfd.
+inline PosixErrorOr<FileDescriptor> NewSignalFD(sigset_t* mask, int flags = 0) {
+ int fd = signalfd(-1, mask, flags);
+ MaybeSave();
+ if (fd < 0) {
+ return PosixError(errno, "signalfd");
+ }
+ return FileDescriptor(fd);
+}
+
+class SignalfdTest : public ::testing::TestWithParam<int> {};
+
+TEST_P(SignalfdTest, Basic) {
+ int signo = GetParam();
+ // Create the signalfd.
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, signo);
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
+
+ // Deliver the blocked signal.
+ const auto scoped_sigmask =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+ ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
+
+ // We should now read the signal.
+ struct signalfd_siginfo rbuf;
+ ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(sizeof(rbuf)));
+ EXPECT_EQ(rbuf.ssi_signo, signo);
+}
+
+TEST_P(SignalfdTest, MaskWorks) {
+ int signo = GetParam();
+ // Create two signalfds with different masks.
+ sigset_t mask1, mask2;
+ sigemptyset(&mask1);
+ sigemptyset(&mask2);
+ sigaddset(&mask1, signo);
+ sigaddset(&mask2, kSignoAlt);
+ FileDescriptor fd1 = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask1, 0));
+ FileDescriptor fd2 = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask2, 0));
+
+ // Deliver the two signals.
+ const auto scoped_sigmask1 =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+ const auto scoped_sigmask2 =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, kSignoAlt));
+ ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
+ ASSERT_THAT(tgkill(getpid(), gettid(), kSignoAlt), SyscallSucceeds());
+
+ // We should see the signals on the appropriate signalfds.
+ //
+ // We read in the opposite order as the signals deliver above, to ensure that
+ // we don't happen to read the correct signal from the correct signalfd.
+ struct signalfd_siginfo rbuf1, rbuf2;
+ ASSERT_THAT(read(fd2.get(), &rbuf2, sizeof(rbuf2)),
+ SyscallSucceedsWithValue(sizeof(rbuf2)));
+ EXPECT_EQ(rbuf2.ssi_signo, kSignoAlt);
+ ASSERT_THAT(read(fd1.get(), &rbuf1, sizeof(rbuf1)),
+ SyscallSucceedsWithValue(sizeof(rbuf1)));
+ EXPECT_EQ(rbuf1.ssi_signo, signo);
+}
+
+TEST(Signalfd, Cloexec) {
+ // Exec tests confirm that O_CLOEXEC has the intended effect. We just create a
+ // signalfd with the appropriate flag here and assert that the FD has it set.
+ sigset_t mask;
+ sigemptyset(&mask);
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_CLOEXEC));
+ EXPECT_THAT(fcntl(fd.get(), F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
+}
+
+TEST_P(SignalfdTest, Blocking) {
+ int signo = GetParam();
+ // Create the signalfd in blocking mode.
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, signo);
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
+
+ // Shared tid variable.
+ absl::Mutex mu;
+ bool has_tid;
+ pid_t tid;
+
+ // Start a thread reading.
+ ScopedThread t([&] {
+ // Copy the tid and notify the caller.
+ {
+ absl::MutexLock ml(&mu);
+ tid = gettid();
+ has_tid = true;
+ }
+
+ // Read the signal from the signalfd.
+ struct signalfd_siginfo rbuf;
+ ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(sizeof(rbuf)));
+ EXPECT_EQ(rbuf.ssi_signo, signo);
+ });
+
+ // Wait until blocked.
+ absl::MutexLock ml(&mu);
+ mu.Await(absl::Condition(&has_tid));
+
+ // Deliver the signal to either the waiting thread, or
+ // to this thread. N.B. this is a bug in the core gVisor
+ // behavior for signalfd, and needs to be fixed.
+ //
+ // See gvisor.dev/issue/139.
+ if (IsRunningOnGvisor()) {
+ ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
+ } else {
+ ASSERT_THAT(tgkill(getpid(), tid, signo), SyscallSucceeds());
+ }
+
+ // Ensure that it was received.
+ t.Join();
+}
+
+TEST_P(SignalfdTest, ThreadGroup) {
+ int signo = GetParam();
+ // Create the signalfd in blocking mode.
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, signo);
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
+
+ // Shared variable.
+ absl::Mutex mu;
+ bool first = false;
+ bool second = false;
+
+ // Start a thread reading.
+ ScopedThread t([&] {
+ // Read the signal from the signalfd.
+ struct signalfd_siginfo rbuf;
+ ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(sizeof(rbuf)));
+ EXPECT_EQ(rbuf.ssi_signo, signo);
+
+ // Wait for the other thread.
+ absl::MutexLock ml(&mu);
+ first = true;
+ mu.Await(absl::Condition(&second));
+ });
+
+ // Deliver the signal to the threadgroup.
+ ASSERT_THAT(kill(getpid(), signo), SyscallSucceeds());
+
+ // Wait for the first thread to process.
+ {
+ absl::MutexLock ml(&mu);
+ mu.Await(absl::Condition(&first));
+ }
+
+ // Deliver to the thread group again (other thread still exists).
+ ASSERT_THAT(kill(getpid(), signo), SyscallSucceeds());
+
+ // Ensure that we can also receive it.
+ struct signalfd_siginfo rbuf;
+ ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(sizeof(rbuf)));
+ EXPECT_EQ(rbuf.ssi_signo, signo);
+
+ // Mark the test as done.
+ {
+ absl::MutexLock ml(&mu);
+ second = true;
+ }
+
+ // The other thread should be joinable.
+ t.Join();
+}
+
+TEST_P(SignalfdTest, Nonblock) {
+ int signo = GetParam();
+ // Create the signalfd in non-blocking mode.
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, signo);
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_NONBLOCK));
+
+ // We should return if we attempt to read.
+ struct signalfd_siginfo rbuf;
+ ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Block and deliver the signal.
+ const auto scoped_sigmask =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+ ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
+
+ // Ensure that a read actually works.
+ ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(sizeof(rbuf)));
+ EXPECT_EQ(rbuf.ssi_signo, signo);
+
+ // Should block again.
+ EXPECT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(SignalfdTest, SetMask) {
+ int signo = GetParam();
+ // Create the signalfd matching nothing.
+ sigset_t mask;
+ sigemptyset(&mask);
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_NONBLOCK));
+
+ // Block and deliver a signal.
+ const auto scoped_sigmask =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+ ASSERT_THAT(tgkill(getpid(), gettid(), signo), SyscallSucceeds());
+
+ // We should have nothing.
+ struct signalfd_siginfo rbuf;
+ ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Change the signal mask.
+ sigaddset(&mask, signo);
+ ASSERT_THAT(signalfd(fd.get(), &mask, 0), SyscallSucceeds());
+
+ // We should now have the signal.
+ ASSERT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(sizeof(rbuf)));
+ EXPECT_EQ(rbuf.ssi_signo, signo);
+}
+
+TEST_P(SignalfdTest, Poll) {
+ int signo = GetParam();
+ // Create the signalfd.
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, signo);
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, 0));
+
+ // Block the signal, and start a thread to deliver it.
+ const auto scoped_sigmask =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, signo));
+ pid_t orig_tid = gettid();
+ ScopedThread t([&] {
+ absl::SleepFor(absl::Seconds(5));
+ ASSERT_THAT(tgkill(getpid(), orig_tid, signo), SyscallSucceeds());
+ });
+
+ // Start polling for the signal. We expect that it is not available at the
+ // outset, but then becomes available when the signal is sent. We give a
+ // timeout of 10000ms (or the delay above + 5 seconds of additional grace
+ // time).
+ struct pollfd poll_fd = {fd.get(), POLLIN, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+ SyscallSucceedsWithValue(1));
+
+ // Actually read the signal to prevent delivery.
+ struct signalfd_siginfo rbuf;
+ EXPECT_THAT(read(fd.get(), &rbuf, sizeof(rbuf)),
+ SyscallSucceedsWithValue(sizeof(rbuf)));
+}
+
+std::string PrintSigno(::testing::TestParamInfo<int> info) {
+ switch (info.param) {
+ case kSigno:
+ return "kSigno";
+ case kSignoMax:
+ return "kSignoMax";
+ default:
+ return absl::StrCat(info.param);
+ }
+}
+INSTANTIATE_TEST_SUITE_P(Signalfd, SignalfdTest,
+ ::testing::Values(kSigno, kSignoMax), PrintSigno);
+
+TEST(Signalfd, Ppoll) {
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGKILL);
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_CLOEXEC));
+
+ // Ensure that the given ppoll blocks.
+ struct pollfd pfd = {};
+ pfd.fd = fd.get();
+ pfd.events = POLLIN;
+ struct timespec timeout = {};
+ timeout.tv_sec = 1;
+ EXPECT_THAT(RetryEINTR(ppoll)(&pfd, 1, &timeout, &mask),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST(Signalfd, KillStillKills) {
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGKILL);
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NewSignalFD(&mask, SFD_CLOEXEC));
+
+ // Just because there is a signalfd, we shouldn't see any change in behavior
+ // for unblockable signals. It's easier to test this with SIGKILL.
+ const auto scoped_sigmask =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, SIGKILL));
+ EXPECT_EXIT(tgkill(getpid(), gettid(), SIGKILL), KilledBySignal(SIGKILL), "");
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ // These tests depend on delivering signals. Block them up front so that all
+ // other threads created by TestInit will also have them blocked, and they
+ // will not interface with the rest of the test.
+ sigset_t set;
+ sigemptyset(&set);
+ sigaddset(&set, gvisor::testing::kSigno);
+ sigaddset(&set, gvisor::testing::kSignoMax);
+ sigaddset(&set, gvisor::testing::kSignoAlt);
+ TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
+
+ gvisor::testing::TestInit(&argc, &argv);
+
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/sigprocmask.cc b/test/syscalls/linux/sigprocmask.cc
new file mode 100644
index 000000000..a603fc1d1
--- /dev/null
+++ b/test/syscalls/linux/sigprocmask.cc
@@ -0,0 +1,269 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <stddef.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Signals numbers used for testing.
+static constexpr int kTestSignal1 = SIGUSR1;
+static constexpr int kTestSignal2 = SIGUSR2;
+
+static int raw_sigprocmask(int how, const sigset_t* set, sigset_t* oldset) {
+ return syscall(SYS_rt_sigprocmask, how, set, oldset, _NSIG / 8);
+}
+
+// count of the number of signals received
+int signal_count[kMaxSignal + 1];
+
+// signal handler increments the signal counter
+void SigHandler(int sig, siginfo_t* info, void* context) {
+ TEST_CHECK(sig > 0 && sig <= kMaxSignal);
+ signal_count[sig] += 1;
+}
+
+// The test fixture saves and restores the signal mask and
+// sets up handlers for kTestSignal1 and kTestSignal2.
+class SigProcMaskTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ // Save the current signal mask.
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &mask_),
+ SyscallSucceeds());
+
+ // Setup signal handlers for kTestSignal1 and kTestSignal2.
+ struct sigaction sa;
+ sa.sa_sigaction = SigHandler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ EXPECT_THAT(sigaction(kTestSignal1, &sa, &sa_test_sig_1_),
+ SyscallSucceeds());
+ EXPECT_THAT(sigaction(kTestSignal2, &sa, &sa_test_sig_2_),
+ SyscallSucceeds());
+
+ // Clear the signal counters.
+ memset(signal_count, 0, sizeof(signal_count));
+ }
+
+ void TearDown() override {
+ // Restore the signal mask.
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, &mask_, nullptr),
+ SyscallSucceeds());
+
+ // Restore the signal handlers for kTestSignal1 and kTestSignal2.
+ EXPECT_THAT(sigaction(kTestSignal1, &sa_test_sig_1_, nullptr),
+ SyscallSucceeds());
+ EXPECT_THAT(sigaction(kTestSignal2, &sa_test_sig_2_, nullptr),
+ SyscallSucceeds());
+ }
+
+ private:
+ sigset_t mask_;
+ struct sigaction sa_test_sig_1_;
+ struct sigaction sa_test_sig_2_;
+};
+
+// Both sigsets nullptr should succeed and do nothing.
+TEST_F(SigProcMaskTest, NullAddress) {
+ EXPECT_THAT(raw_sigprocmask(SIG_BLOCK, nullptr, NULL), SyscallSucceeds());
+ EXPECT_THAT(raw_sigprocmask(SIG_UNBLOCK, nullptr, NULL), SyscallSucceeds());
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, NULL), SyscallSucceeds());
+}
+
+// Bad address for either sigset should fail with EFAULT.
+TEST_F(SigProcMaskTest, BadAddress) {
+ sigset_t* bad_addr = reinterpret_cast<sigset_t*>(-1);
+
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, bad_addr, nullptr),
+ SyscallFailsWithErrno(EFAULT));
+
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, bad_addr),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+// Bad value of the "how" parameter should fail with EINVAL.
+TEST_F(SigProcMaskTest, BadParameter) {
+ int bad_param_1 = -1;
+ int bad_param_2 = 42;
+
+ sigset_t set1;
+ sigemptyset(&set1);
+
+ EXPECT_THAT(raw_sigprocmask(bad_param_1, &set1, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+
+ EXPECT_THAT(raw_sigprocmask(bad_param_2, &set1, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Check that we can get the current signal mask.
+TEST_F(SigProcMaskTest, GetMask) {
+ sigset_t set1;
+ sigset_t set2;
+
+ sigemptyset(&set1);
+ sigfillset(&set2);
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &set1), SyscallSucceeds());
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &set2), SyscallSucceeds());
+ EXPECT_THAT(set1, EqualsSigset(set2));
+}
+
+// Check that we can set the signal mask.
+TEST_F(SigProcMaskTest, SetMask) {
+ sigset_t actual;
+ sigset_t expected;
+
+ // Try to mask all signals
+ sigfillset(&expected);
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, &expected, nullptr),
+ SyscallSucceeds());
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &actual),
+ SyscallSucceeds());
+ // sigprocmask() should have silently ignored SIGKILL and SIGSTOP.
+ sigdelset(&expected, SIGSTOP);
+ sigdelset(&expected, SIGKILL);
+ EXPECT_THAT(actual, EqualsSigset(expected));
+
+ // Try to clear the signal mask
+ sigemptyset(&expected);
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, &expected, nullptr),
+ SyscallSucceeds());
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &actual),
+ SyscallSucceeds());
+ EXPECT_THAT(actual, EqualsSigset(expected));
+
+ // Try to set a mask with one signal.
+ sigemptyset(&expected);
+ sigaddset(&expected, kTestSignal1);
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, &expected, nullptr),
+ SyscallSucceeds());
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &actual),
+ SyscallSucceeds());
+ EXPECT_THAT(actual, EqualsSigset(expected));
+}
+
+// Check that we can add and remove signals.
+TEST_F(SigProcMaskTest, BlockUnblock) {
+ sigset_t actual;
+ sigset_t expected;
+
+ // Try to set a mask with one signal.
+ sigemptyset(&expected);
+ sigaddset(&expected, kTestSignal1);
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, &expected, nullptr),
+ SyscallSucceeds());
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &actual),
+ SyscallSucceeds());
+ EXPECT_THAT(actual, EqualsSigset(expected));
+
+ // Try to add another signal.
+ sigset_t block;
+ sigemptyset(&block);
+ sigaddset(&block, kTestSignal2);
+ EXPECT_THAT(raw_sigprocmask(SIG_BLOCK, &block, nullptr), SyscallSucceeds());
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &actual),
+ SyscallSucceeds());
+ sigaddset(&expected, kTestSignal2);
+ EXPECT_THAT(actual, EqualsSigset(expected));
+
+ // Try to remove a signal.
+ sigset_t unblock;
+ sigemptyset(&unblock);
+ sigaddset(&unblock, kTestSignal1);
+ EXPECT_THAT(raw_sigprocmask(SIG_UNBLOCK, &unblock, nullptr),
+ SyscallSucceeds());
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, nullptr, &actual),
+ SyscallSucceeds());
+ sigdelset(&expected, kTestSignal1);
+ EXPECT_THAT(actual, EqualsSigset(expected));
+}
+
+// Test that the signal mask actually blocks signals.
+TEST_F(SigProcMaskTest, SignalHandler) {
+ sigset_t mask;
+
+ // clear the signal mask
+ sigemptyset(&mask);
+ EXPECT_THAT(raw_sigprocmask(SIG_SETMASK, &mask, nullptr), SyscallSucceeds());
+
+ // Check the initial signal counts.
+ EXPECT_EQ(0, signal_count[kTestSignal1]);
+ EXPECT_EQ(0, signal_count[kTestSignal2]);
+
+ // Check that both kTestSignal1 and kTestSignal2 are not blocked.
+ raise(kTestSignal1);
+ raise(kTestSignal2);
+ EXPECT_EQ(1, signal_count[kTestSignal1]);
+ EXPECT_EQ(1, signal_count[kTestSignal2]);
+
+ // Block kTestSignal1.
+ sigaddset(&mask, kTestSignal1);
+ EXPECT_THAT(raw_sigprocmask(SIG_BLOCK, &mask, nullptr), SyscallSucceeds());
+
+ // Check that kTestSignal1 is blocked.
+ raise(kTestSignal1);
+ raise(kTestSignal2);
+ EXPECT_EQ(1, signal_count[kTestSignal1]);
+ EXPECT_EQ(2, signal_count[kTestSignal2]);
+
+ // Unblock kTestSignal1.
+ sigaddset(&mask, kTestSignal1);
+ EXPECT_THAT(raw_sigprocmask(SIG_UNBLOCK, &mask, nullptr), SyscallSucceeds());
+
+ // Check that the unblocked kTestSignal1 has been delivered.
+ EXPECT_EQ(2, signal_count[kTestSignal1]);
+ EXPECT_EQ(2, signal_count[kTestSignal2]);
+}
+
+// Check that sigprocmask correctly handles aliasing of the set and oldset
+// pointers. Regression test for b/30502311.
+TEST_F(SigProcMaskTest, AliasedSets) {
+ sigset_t mask;
+
+ // Set a mask in which only kTestSignal1 is blocked.
+ sigset_t mask1;
+ sigemptyset(&mask1);
+ sigaddset(&mask1, kTestSignal1);
+ mask = mask1;
+ ASSERT_THAT(raw_sigprocmask(SIG_SETMASK, &mask, nullptr), SyscallSucceeds());
+
+ // Exchange it with a mask in which only kTestSignal2 is blocked.
+ sigset_t mask2;
+ sigemptyset(&mask2);
+ sigaddset(&mask2, kTestSignal2);
+ mask = mask2;
+ ASSERT_THAT(raw_sigprocmask(SIG_SETMASK, &mask, &mask), SyscallSucceeds());
+
+ // Check that the exchange succeeeded:
+ // mask should now contain the previously-set mask blocking only kTestSignal1.
+ EXPECT_THAT(mask, EqualsSigset(mask1));
+ // The current mask should block only kTestSignal2.
+ ASSERT_THAT(raw_sigprocmask(0, nullptr, &mask), SyscallSucceeds());
+ EXPECT_THAT(mask, EqualsSigset(mask2));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/sigstop.cc b/test/syscalls/linux/sigstop.cc
new file mode 100644
index 000000000..b2fcedd62
--- /dev/null
+++ b/test/syscalls/linux/sigstop.cc
@@ -0,0 +1,151 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <stdlib.h>
+#include <sys/select.h>
+
+#include "gtest/gtest.h"
+#include "absl/flags/flag.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+ABSL_FLAG(bool, sigstop_test_child, false,
+ "If true, run the SigstopTest child workload.");
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr absl::Duration kChildStartupDelay = absl::Seconds(5);
+constexpr absl::Duration kChildMainThreadDelay = absl::Seconds(10);
+constexpr absl::Duration kChildExtraThreadDelay = absl::Seconds(15);
+constexpr absl::Duration kPostSIGSTOPDelay = absl::Seconds(20);
+
+// Comparisons on absl::Duration aren't yet constexpr (2017-07-14), so we
+// can't just use static_assert.
+TEST(SigstopTest, TimesAreRelativelyConsistent) {
+ EXPECT_LT(kChildStartupDelay, kChildMainThreadDelay)
+ << "Child process will exit before the parent process attempts to stop "
+ "it";
+ EXPECT_LT(kChildMainThreadDelay, kChildExtraThreadDelay)
+ << "Secondary thread in child process will exit before main thread, "
+ "causing it to exit with the wrong code";
+ EXPECT_LT(kChildExtraThreadDelay, kPostSIGSTOPDelay)
+ << "Parent process stops waiting before child process may exit if "
+ "improperly stopped, rendering the test ineffective";
+}
+
+// Exit codes communicated from the child workload to the parent test process.
+constexpr int kChildMainThreadExitCode = 10;
+constexpr int kChildExtraThreadExitCode = 11;
+
+TEST(SigstopTest, Correctness) {
+ pid_t child_pid = -1;
+ int execve_errno = 0;
+ auto cleanup = ASSERT_NO_ERRNO_AND_VALUE(
+ ForkAndExec("/proc/self/exe", {"/proc/self/exe", "--sigstop_test_child"},
+ {}, nullptr, &child_pid, &execve_errno));
+
+ ASSERT_GT(child_pid, 0);
+ ASSERT_EQ(execve_errno, 0);
+
+ // Wait for the child subprocess to start the second thread before stopping
+ // it.
+ absl::SleepFor(kChildStartupDelay);
+ ASSERT_THAT(kill(child_pid, SIGSTOP), SyscallSucceeds());
+ int status;
+ EXPECT_THAT(RetryEINTR(waitpid)(child_pid, &status, WUNTRACED),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFSTOPPED(status));
+ EXPECT_EQ(SIGSTOP, WSTOPSIG(status));
+
+ // Sleep for longer than either of the sleeps in the child subprocess,
+ // expecting the child to stay alive because it's stopped.
+ absl::SleepFor(kPostSIGSTOPDelay);
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, WNOHANG),
+ SyscallSucceedsWithValue(0));
+
+ // Resume the child.
+ ASSERT_THAT(kill(child_pid, SIGCONT), SyscallSucceeds());
+
+ EXPECT_THAT(RetryEINTR(waitpid)(child_pid, &status, WCONTINUED),
+ SyscallSucceedsWithValue(child_pid));
+ EXPECT_TRUE(WIFCONTINUED(status));
+
+ // Expect it to die.
+ ASSERT_THAT(RetryEINTR(waitpid)(child_pid, &status, 0), SyscallSucceeds());
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(WEXITSTATUS(status), kChildMainThreadExitCode);
+}
+
+// Like base:SleepFor, but tries to avoid counting time spent stopped due to a
+// stop signal toward the sleep.
+//
+// This is required due to an inconsistency in how nanosleep(2) and stop signals
+// interact on Linux. When nanosleep is interrupted, it writes the remaining
+// time back to its second timespec argument, so that if nanosleep is
+// interrupted by a signal handler then userspace can immediately call nanosleep
+// again with that timespec. However, if nanosleep is automatically restarted
+// (because it's interrupted by a signal that is not delivered to a handler,
+// such as a stop signal), it's restarted based on the timer's former *absolute*
+// expiration time (via ERESTART_RESTARTBLOCK => SYS_restart_syscall =>
+// hrtimer_nanosleep_restart). This means that time spent stopped is effectively
+// counted as time spent sleeping, resulting in less time spent sleeping than
+// expected.
+//
+// Dividing the sleep into multiple smaller sleeps limits the impact of this
+// effect to the length of each sleep during which a stop occurs; for example,
+// if a sleeping process is only stopped once, SleepIgnoreStopped can
+// under-sleep by at most 100ms.
+void SleepIgnoreStopped(absl::Duration d) {
+ absl::Duration const max_sleep = absl::Milliseconds(100);
+ while (d > absl::ZeroDuration()) {
+ absl::Duration to_sleep = std::min(d, max_sleep);
+ absl::SleepFor(to_sleep);
+ d -= to_sleep;
+ }
+}
+
+void RunChild() {
+ // Start another thread that attempts to call exit_group with a different
+ // error code, in order to verify that SIGSTOP stops this thread as well.
+ ScopedThread t([] {
+ SleepIgnoreStopped(kChildExtraThreadDelay);
+ exit(kChildExtraThreadExitCode);
+ });
+ SleepIgnoreStopped(kChildMainThreadDelay);
+ exit(kChildMainThreadExitCode);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ gvisor::testing::TestInit(&argc, &argv);
+
+ if (absl::GetFlag(FLAGS_sigstop_test_child)) {
+ gvisor::testing::RunChild();
+ return 1;
+ }
+
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/sigtimedwait.cc b/test/syscalls/linux/sigtimedwait.cc
new file mode 100644
index 000000000..4f8afff15
--- /dev/null
+++ b/test/syscalls/linux/sigtimedwait.cc
@@ -0,0 +1,323 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/timer_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// N.B. main() blocks SIGALRM and SIGCHLD on all threads.
+
+constexpr int kAlarmSecs = 12;
+
+void NoopHandler(int sig, siginfo_t* info, void* context) {}
+
+TEST(SigtimedwaitTest, InvalidTimeout) {
+ sigset_t mask;
+ sigemptyset(&mask);
+ struct timespec timeout = {0, 1000000001};
+ EXPECT_THAT(sigtimedwait(&mask, nullptr, &timeout),
+ SyscallFailsWithErrno(EINVAL));
+ timeout = {-1, 0};
+ EXPECT_THAT(sigtimedwait(&mask, nullptr, &timeout),
+ SyscallFailsWithErrno(EINVAL));
+ timeout = {0, -1};
+ EXPECT_THAT(sigtimedwait(&mask, nullptr, &timeout),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// No random save as the test relies on alarm timing. Cooperative save tests
+// already cover the save between alarm and wait.
+TEST(SigtimedwaitTest, AlarmReturnsAlarm_NoRandomSave) {
+ struct itimerval itv = {};
+ itv.it_value.tv_sec = kAlarmSecs;
+ const auto itimer_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_REAL, itv));
+
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGALRM);
+ siginfo_t info = {};
+ EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, &info, nullptr),
+ SyscallSucceedsWithValue(SIGALRM));
+ EXPECT_EQ(SIGALRM, info.si_signo);
+}
+
+// No random save as the test relies on alarm timing. Cooperative save tests
+// already cover the save between alarm and wait.
+TEST(SigtimedwaitTest, NullTimeoutReturnsEINTR_NoRandomSave) {
+ struct sigaction sa;
+ sa.sa_sigaction = NoopHandler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ const auto action_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGALRM, sa));
+
+ const auto mask_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, SIGALRM));
+
+ struct itimerval itv = {};
+ itv.it_value.tv_sec = kAlarmSecs;
+ const auto itimer_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedItimer(ITIMER_REAL, itv));
+
+ sigset_t mask;
+ sigemptyset(&mask);
+ EXPECT_THAT(sigtimedwait(&mask, nullptr, nullptr),
+ SyscallFailsWithErrno(EINTR));
+}
+
+TEST(SigtimedwaitTest, LegitTimeoutReturnsEAGAIN) {
+ sigset_t mask;
+ sigemptyset(&mask);
+ struct timespec timeout = {1, 0}; // 1 second
+ EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, nullptr, &timeout),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST(SigtimedwaitTest, ZeroTimeoutReturnsEAGAIN) {
+ sigset_t mask;
+ sigemptyset(&mask);
+ struct timespec timeout = {0, 0}; // 0 second
+ EXPECT_THAT(sigtimedwait(&mask, nullptr, &timeout),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST(SigtimedwaitTest, KillGeneratedSIGCHLD) {
+ EXPECT_THAT(kill(getpid(), SIGCHLD), SyscallSucceeds());
+
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGCHLD);
+ struct timespec ts = {5, 0};
+ EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, nullptr, &ts),
+ SyscallSucceedsWithValue(SIGCHLD));
+}
+
+TEST(SigtimedwaitTest, ChildExitGeneratedSIGCHLD) {
+ pid_t pid = fork();
+ if (pid == 0) {
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+
+ int status;
+ EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) << status;
+
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGCHLD);
+ struct timespec ts = {5, 0};
+ EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, nullptr, &ts),
+ SyscallSucceedsWithValue(SIGCHLD));
+}
+
+TEST(SigtimedwaitTest, ChildExitGeneratedSIGCHLDWithHandler) {
+ // Setup handler for SIGCHLD, but don't unblock it.
+ struct sigaction sa;
+ sa.sa_sigaction = NoopHandler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ const auto action_cleanup =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGCHLD, sa));
+
+ pid_t pid = fork();
+ if (pid == 0) {
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGCHLD);
+ struct timespec ts = {5, 0};
+ EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, nullptr, &ts),
+ SyscallSucceedsWithValue(SIGCHLD));
+
+ int status;
+ EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) << status;
+}
+
+// sigtimedwait cannot catch SIGKILL.
+TEST(SigtimedwaitTest, SIGKILLUncaught) {
+ // This is a regression test for sigtimedwait dequeuing SIGKILLs, thus
+ // preventing the task from exiting.
+ //
+ // The explanation below is specific to behavior in gVisor. The Linux behavior
+ // here is irrelevant because without a bug that prevents delivery of SIGKILL,
+ // none of this behavior is visible (in Linux or gVisor).
+ //
+ // SIGKILL is rather intrusive. Simply sending the SIGKILL marks
+ // ThreadGroup.exitStatus as exiting with SIGKILL, before the SIGKILL is even
+ // delivered.
+ //
+ // As a result, we cannot simply exit the child with a different exit code if
+ // it survives and expect to see that code in waitpid because:
+ // 1. PrepareGroupExit will override Task.exitStatus with
+ // ThreadGroup.exitStatus.
+ // 2. waitpid(2) will always return ThreadGroup.exitStatus rather than
+ // Task.exitStatus.
+ //
+ // We could use exit(2) to set Task.exitStatus without override, and a SIGCHLD
+ // handler to receive Task.exitStatus in the parent, but with that much
+ // test complexity, it is cleaner to simply use a pipe to notify the parent
+ // that we survived.
+ constexpr auto kSigtimedwaitSetupTime = absl::Seconds(2);
+
+ int pipe_fds[2];
+ ASSERT_THAT(pipe(pipe_fds), SyscallSucceeds());
+ FileDescriptor rfd(pipe_fds[0]);
+ FileDescriptor wfd(pipe_fds[1]);
+
+ pid_t pid = fork();
+ if (pid == 0) {
+ rfd.reset();
+
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGKILL);
+ RetryEINTR(sigtimedwait)(&mask, nullptr, nullptr);
+
+ // Survived.
+ char c = 'a';
+ TEST_PCHECK(WriteFd(wfd.get(), &c, 1) == 1);
+ _exit(1);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+
+ wfd.reset();
+
+ // Wait for child to block in sigtimedwait, then kill it.
+ absl::SleepFor(kSigtimedwaitSetupTime);
+
+ // Sending SIGKILL will attempt to enqueue the signal twice: once in the
+ // normal signal sending path, and once to all Tasks in the ThreadGroup when
+ // applying SIGKILL side-effects.
+ //
+ // If we use kill(2), the former will be on the ThreadGroup signal queue and
+ // the latter will be on the Task signal queue. sigtimedwait can only dequeue
+ // one signal, so the other would kill the Task, masking bugs.
+ //
+ // If we use tkill(2), the former will be on the Task signal queue and the
+ // latter will be dropped as a duplicate. Then sigtimedwait can theoretically
+ // dequeue the single SIGKILL.
+ EXPECT_THAT(syscall(SYS_tkill, pid, SIGKILL), SyscallSucceeds());
+
+ int status;
+ EXPECT_THAT(RetryEINTR(waitpid)(pid, &status, 0),
+ SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL) << status;
+
+ // Child shouldn't have survived.
+ char c;
+ EXPECT_THAT(ReadFd(rfd.get(), &c, 1), SyscallSucceedsWithValue(0));
+}
+
+TEST(SigtimedwaitTest, IgnoredUnmaskedSignal) {
+ constexpr int kSigno = SIGUSR1;
+ constexpr auto kSigtimedwaitSetupTime = absl::Seconds(2);
+ constexpr auto kSigtimedwaitTimeout = absl::Seconds(5);
+ ASSERT_GT(kSigtimedwaitTimeout, kSigtimedwaitSetupTime);
+
+ // Ensure that kSigno is ignored, and unmasked on this thread.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_IGN;
+ const auto scoped_sigaction =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(kSigno, sa));
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, kSigno);
+ auto scoped_sigmask =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, mask));
+
+ // Create a thread which will send us kSigno while we are blocked in
+ // sigtimedwait.
+ pid_t tid = gettid();
+ ScopedThread sigthread([&] {
+ absl::SleepFor(kSigtimedwaitSetupTime);
+ EXPECT_THAT(tgkill(getpid(), tid, kSigno), SyscallSucceeds());
+ });
+
+ // sigtimedwait should not observe kSigno since it is ignored and already
+ // unmasked, causing it to be dropped before it is enqueued.
+ struct timespec timeout_ts = absl::ToTimespec(kSigtimedwaitTimeout);
+ EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, nullptr, &timeout_ts),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST(SigtimedwaitTest, IgnoredMaskedSignal) {
+ constexpr int kSigno = SIGUSR1;
+ constexpr auto kSigtimedwaitSetupTime = absl::Seconds(2);
+ constexpr auto kSigtimedwaitTimeout = absl::Seconds(5);
+ ASSERT_GT(kSigtimedwaitTimeout, kSigtimedwaitSetupTime);
+
+ // Ensure that kSigno is ignored, and masked on this thread.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_IGN;
+ const auto scoped_sigaction =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(kSigno, sa));
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, kSigno);
+ auto scoped_sigmask =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, mask));
+
+ // Create a thread which will send us kSigno while we are blocked in
+ // sigtimedwait.
+ pid_t tid = gettid();
+ ScopedThread sigthread([&] {
+ absl::SleepFor(kSigtimedwaitSetupTime);
+ EXPECT_THAT(tgkill(getpid(), tid, kSigno), SyscallSucceeds());
+ });
+
+ // sigtimedwait should observe kSigno since it is normally masked, causing it
+ // to be enqueued despite being ignored.
+ struct timespec timeout_ts = absl::ToTimespec(kSigtimedwaitTimeout);
+ EXPECT_THAT(RetryEINTR(sigtimedwait)(&mask, nullptr, &timeout_ts),
+ SyscallSucceedsWithValue(kSigno));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ // These tests depend on delivering SIGALRM/SIGCHLD to the main thread or in
+ // sigtimedwait. Block them so that any other threads created by TestInit will
+ // also have them blocked.
+ sigset_t set;
+ sigemptyset(&set);
+ sigaddset(&set, SIGALRM);
+ sigaddset(&set, SIGCHLD);
+ TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
+
+ gvisor::testing::TestInit(&argc, &argv);
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/socket.cc b/test/syscalls/linux/socket.cc
new file mode 100644
index 000000000..c20cd3fcc
--- /dev/null
+++ b/test/syscalls/linux/socket.cc
@@ -0,0 +1,121 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_umask.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST(SocketTest, UnixSocketPairProtocol) {
+ int socks[2];
+ ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, PF_UNIX, socks),
+ SyscallSucceeds());
+ close(socks[0]);
+ close(socks[1]);
+}
+
+TEST(SocketTest, ProtocolUnix) {
+ struct {
+ int domain, type, protocol;
+ } tests[] = {
+ {AF_UNIX, SOCK_STREAM, PF_UNIX},
+ {AF_UNIX, SOCK_SEQPACKET, PF_UNIX},
+ {AF_UNIX, SOCK_DGRAM, PF_UNIX},
+ };
+ for (int i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
+ ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(tests[i].domain, tests[i].type, tests[i].protocol));
+ }
+}
+
+TEST(SocketTest, ProtocolInet) {
+ struct {
+ int domain, type, protocol;
+ } tests[] = {
+ {AF_INET, SOCK_DGRAM, IPPROTO_UDP},
+ {AF_INET, SOCK_STREAM, IPPROTO_TCP},
+ };
+ for (int i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
+ ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(tests[i].domain, tests[i].type, tests[i].protocol));
+ }
+}
+
+TEST(SocketTest, UnixSocketStat) {
+ SKIP_IF(IsRunningWithVFS1());
+
+ FileDescriptor bound =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
+
+ // The permissions of the file created with bind(2) should be defined by the
+ // permissions of the bound socket and the umask.
+ mode_t sock_perm = 0765, mask = 0123;
+ ASSERT_THAT(fchmod(bound.get(), sock_perm), SyscallSucceeds());
+ TempUmask m(mask);
+
+ struct sockaddr_un addr =
+ ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(/*abstract=*/false, AF_UNIX));
+ ASSERT_THAT(bind(bound.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallSucceeds());
+
+ struct stat statbuf = {};
+ ASSERT_THAT(stat(addr.sun_path, &statbuf), SyscallSucceeds());
+
+ // Mode should be S_IFSOCK.
+ EXPECT_EQ(statbuf.st_mode, S_IFSOCK | sock_perm & ~mask);
+
+ // Timestamps should be equal and non-zero.
+ // TODO(b/158882152): Sockets currently don't implement timestamps.
+ if (!IsRunningOnGvisor()) {
+ EXPECT_NE(statbuf.st_atime, 0);
+ EXPECT_EQ(statbuf.st_atime, statbuf.st_mtime);
+ EXPECT_EQ(statbuf.st_atime, statbuf.st_ctime);
+ }
+}
+
+using SocketOpenTest = ::testing::TestWithParam<int>;
+
+// UDS cannot be opened.
+TEST_P(SocketOpenTest, Unix) {
+ // FIXME(b/142001530): Open incorrectly succeeds on gVisor.
+ SKIP_IF(IsRunningWithVFS1());
+
+ FileDescriptor bound =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
+
+ struct sockaddr_un addr =
+ ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(/*abstract=*/false, AF_UNIX));
+
+ ASSERT_THAT(bind(bound.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallSucceeds());
+
+ EXPECT_THAT(open(addr.sun_path, GetParam()), SyscallFailsWithErrno(ENXIO));
+}
+
+INSTANTIATE_TEST_SUITE_P(OpenModes, SocketOpenTest,
+ ::testing::Values(O_RDONLY, O_RDWR));
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_abstract.cc b/test/syscalls/linux/socket_abstract.cc
new file mode 100644
index 000000000..00999f192
--- /dev/null
+++ b/test/syscalls/linux/socket_abstract.cc
@@ -0,0 +1,49 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_generic.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/socket_unix.h"
+#include "test/syscalls/linux/socket_unix_cmsg.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return ApplyVec<SocketPairKind>(
+ AbstractBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK}));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AbstractUnixSockets, AllSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ AbstractUnixSockets, UnixSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ AbstractUnixSockets, UnixSocketPairCmsgTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_bind_to_device.cc b/test/syscalls/linux/socket_bind_to_device.cc
new file mode 100644
index 000000000..6b27f6eab
--- /dev/null
+++ b/test/syscalls/linux/socket_bind_to_device.cc
@@ -0,0 +1,313 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/if_tun.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_bind_to_device_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+using std::string;
+
+// Test fixture for SO_BINDTODEVICE tests.
+class BindToDeviceTest : public ::testing::TestWithParam<SocketKind> {
+ protected:
+ void SetUp() override {
+ printf("Testing case: %s\n", GetParam().description.c_str());
+ ASSERT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)))
+ << "CAP_NET_RAW is required to use SO_BINDTODEVICE";
+
+ interface_name_ = "eth1";
+ auto interface_names = GetInterfaceNames();
+ if (interface_names.find(interface_name_) == interface_names.end()) {
+ // Need a tunnel.
+ tunnel_ = ASSERT_NO_ERRNO_AND_VALUE(Tunnel::New());
+ interface_name_ = tunnel_->GetName();
+ ASSERT_FALSE(interface_name_.empty());
+ }
+ socket_ = ASSERT_NO_ERRNO_AND_VALUE(GetParam().Create());
+ }
+
+ string interface_name() const { return interface_name_; }
+
+ int socket_fd() const { return socket_->get(); }
+
+ private:
+ std::unique_ptr<Tunnel> tunnel_;
+ string interface_name_;
+ std::unique_ptr<FileDescriptor> socket_;
+};
+
+constexpr char kIllegalIfnameChar = '/';
+
+// Tests getsockopt of the default value.
+TEST_P(BindToDeviceTest, GetsockoptDefault) {
+ char name_buffer[IFNAMSIZ * 2];
+ char original_name_buffer[IFNAMSIZ * 2];
+ socklen_t name_buffer_size;
+
+ // Read the default SO_BINDTODEVICE.
+ memset(original_name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+ for (size_t i = 0; i <= sizeof(name_buffer); i++) {
+ memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+ name_buffer_size = i;
+ EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+ name_buffer, &name_buffer_size),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(name_buffer_size, 0);
+ EXPECT_EQ(memcmp(name_buffer, original_name_buffer, sizeof(name_buffer)),
+ 0);
+ }
+}
+
+// Tests setsockopt of invalid device name.
+TEST_P(BindToDeviceTest, SetsockoptInvalidDeviceName) {
+ char name_buffer[IFNAMSIZ * 2];
+ socklen_t name_buffer_size;
+
+ // Set an invalid device name.
+ memset(name_buffer, kIllegalIfnameChar, 5);
+ name_buffer_size = 5;
+ EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ name_buffer_size),
+ SyscallFailsWithErrno(ENODEV));
+}
+
+// Tests setsockopt of a buffer with a valid device name but not
+// null-terminated, with different sizes of buffer.
+TEST_P(BindToDeviceTest, SetsockoptValidDeviceNameWithoutNullTermination) {
+ char name_buffer[IFNAMSIZ * 2];
+ socklen_t name_buffer_size;
+
+ strncpy(name_buffer, interface_name().c_str(), interface_name().size() + 1);
+ // Intentionally overwrite the null at the end.
+ memset(name_buffer + interface_name().size(), kIllegalIfnameChar,
+ sizeof(name_buffer) - interface_name().size());
+ for (size_t i = 1; i <= sizeof(name_buffer); i++) {
+ name_buffer_size = i;
+ SCOPED_TRACE(absl::StrCat("Buffer size: ", i));
+ // It should only work if the size provided is exactly right.
+ if (name_buffer_size == interface_name().size()) {
+ EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+ name_buffer, name_buffer_size),
+ SyscallSucceeds());
+ } else {
+ EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+ name_buffer, name_buffer_size),
+ SyscallFailsWithErrno(ENODEV));
+ }
+ }
+}
+
+// Tests setsockopt of a buffer with a valid device name and null-terminated,
+// with different sizes of buffer.
+TEST_P(BindToDeviceTest, SetsockoptValidDeviceNameWithNullTermination) {
+ char name_buffer[IFNAMSIZ * 2];
+ socklen_t name_buffer_size;
+
+ strncpy(name_buffer, interface_name().c_str(), interface_name().size() + 1);
+ // Don't overwrite the null at the end.
+ memset(name_buffer + interface_name().size() + 1, kIllegalIfnameChar,
+ sizeof(name_buffer) - interface_name().size() - 1);
+ for (size_t i = 1; i <= sizeof(name_buffer); i++) {
+ name_buffer_size = i;
+ SCOPED_TRACE(absl::StrCat("Buffer size: ", i));
+ // It should only work if the size provided is at least the right size.
+ if (name_buffer_size >= interface_name().size()) {
+ EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+ name_buffer, name_buffer_size),
+ SyscallSucceeds());
+ } else {
+ EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+ name_buffer, name_buffer_size),
+ SyscallFailsWithErrno(ENODEV));
+ }
+ }
+}
+
+// Tests that setsockopt of an invalid device name doesn't unset the previous
+// valid setsockopt.
+TEST_P(BindToDeviceTest, SetsockoptValidThenInvalid) {
+ char name_buffer[IFNAMSIZ * 2];
+ socklen_t name_buffer_size;
+
+ // Write successfully.
+ strncpy(name_buffer, interface_name().c_str(), sizeof(name_buffer));
+ ASSERT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ sizeof(name_buffer)),
+ SyscallSucceeds());
+
+ // Read it back successfully.
+ memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+ name_buffer_size = sizeof(name_buffer);
+ EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ &name_buffer_size),
+ SyscallSucceeds());
+ EXPECT_EQ(name_buffer_size, interface_name().size() + 1);
+ EXPECT_STREQ(name_buffer, interface_name().c_str());
+
+ // Write unsuccessfully.
+ memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+ name_buffer_size = 5;
+ EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ sizeof(name_buffer)),
+ SyscallFailsWithErrno(ENODEV));
+
+ // Read it back successfully, it's unchanged.
+ memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+ name_buffer_size = sizeof(name_buffer);
+ EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ &name_buffer_size),
+ SyscallSucceeds());
+ EXPECT_EQ(name_buffer_size, interface_name().size() + 1);
+ EXPECT_STREQ(name_buffer, interface_name().c_str());
+}
+
+// Tests that setsockopt of zero-length string correctly unsets the previous
+// value.
+TEST_P(BindToDeviceTest, SetsockoptValidThenClear) {
+ char name_buffer[IFNAMSIZ * 2];
+ socklen_t name_buffer_size;
+
+ // Write successfully.
+ strncpy(name_buffer, interface_name().c_str(), sizeof(name_buffer));
+ EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ sizeof(name_buffer)),
+ SyscallSucceeds());
+
+ // Read it back successfully.
+ memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+ name_buffer_size = sizeof(name_buffer);
+ EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ &name_buffer_size),
+ SyscallSucceeds());
+ EXPECT_EQ(name_buffer_size, interface_name().size() + 1);
+ EXPECT_STREQ(name_buffer, interface_name().c_str());
+
+ // Clear it successfully.
+ name_buffer_size = 0;
+ EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ name_buffer_size),
+ SyscallSucceeds());
+
+ // Read it back successfully, it's cleared.
+ memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+ name_buffer_size = sizeof(name_buffer);
+ EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ &name_buffer_size),
+ SyscallSucceeds());
+ EXPECT_EQ(name_buffer_size, 0);
+}
+
+// Tests that setsockopt of empty string correctly unsets the previous
+// value.
+TEST_P(BindToDeviceTest, SetsockoptValidThenClearWithNull) {
+ char name_buffer[IFNAMSIZ * 2];
+ socklen_t name_buffer_size;
+
+ // Write successfully.
+ strncpy(name_buffer, interface_name().c_str(), sizeof(name_buffer));
+ EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ sizeof(name_buffer)),
+ SyscallSucceeds());
+
+ // Read it back successfully.
+ memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+ name_buffer_size = sizeof(name_buffer);
+ EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ &name_buffer_size),
+ SyscallSucceeds());
+ EXPECT_EQ(name_buffer_size, interface_name().size() + 1);
+ EXPECT_STREQ(name_buffer, interface_name().c_str());
+
+ // Clear it successfully.
+ memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+ name_buffer[0] = 0;
+ name_buffer_size = sizeof(name_buffer);
+ EXPECT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ name_buffer_size),
+ SyscallSucceeds());
+
+ // Read it back successfully, it's cleared.
+ memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+ name_buffer_size = sizeof(name_buffer);
+ EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ &name_buffer_size),
+ SyscallSucceeds());
+ EXPECT_EQ(name_buffer_size, 0);
+}
+
+// Tests getsockopt with different buffer sizes.
+TEST_P(BindToDeviceTest, GetsockoptDevice) {
+ char name_buffer[IFNAMSIZ * 2];
+ socklen_t name_buffer_size;
+
+ // Write successfully.
+ strncpy(name_buffer, interface_name().c_str(), sizeof(name_buffer));
+ ASSERT_THAT(setsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE, name_buffer,
+ sizeof(name_buffer)),
+ SyscallSucceeds());
+
+ // Read it back at various buffer sizes.
+ for (size_t i = 0; i <= sizeof(name_buffer); i++) {
+ memset(name_buffer, kIllegalIfnameChar, sizeof(name_buffer));
+ name_buffer_size = i;
+ SCOPED_TRACE(absl::StrCat("Buffer size: ", i));
+ // Linux only allows a buffer at least IFNAMSIZ, even if less would suffice
+ // for this interface name.
+ if (name_buffer_size >= IFNAMSIZ) {
+ EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+ name_buffer, &name_buffer_size),
+ SyscallSucceeds());
+ EXPECT_EQ(name_buffer_size, interface_name().size() + 1);
+ EXPECT_STREQ(name_buffer, interface_name().c_str());
+ } else {
+ EXPECT_THAT(getsockopt(socket_fd(), SOL_SOCKET, SO_BINDTODEVICE,
+ name_buffer, &name_buffer_size),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_EQ(name_buffer_size, i);
+ }
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(BindToDeviceTest, BindToDeviceTest,
+ ::testing::Values(IPv4UDPUnboundSocket(0),
+ IPv4TCPUnboundSocket(0)));
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_bind_to_device_distribution.cc b/test/syscalls/linux/socket_bind_to_device_distribution.cc
new file mode 100644
index 000000000..5ed57625c
--- /dev/null
+++ b/test/syscalls/linux/socket_bind_to_device_distribution.cc
@@ -0,0 +1,401 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/if_tun.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <atomic>
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_bind_to_device_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+using std::string;
+using std::vector;
+
+struct EndpointConfig {
+ std::string bind_to_device;
+ double expected_ratio;
+};
+
+struct DistributionTestCase {
+ std::string name;
+ std::vector<EndpointConfig> endpoints;
+};
+
+struct ListenerConnector {
+ TestAddress listener;
+ TestAddress connector;
+};
+
+// Test fixture for SO_BINDTODEVICE tests the distribution of packets received
+// with varying SO_BINDTODEVICE settings.
+class BindToDeviceDistributionTest
+ : public ::testing::TestWithParam<
+ ::testing::tuple<ListenerConnector, DistributionTestCase>> {
+ protected:
+ void SetUp() override {
+ printf("Testing case: %s, listener=%s, connector=%s\n",
+ ::testing::get<1>(GetParam()).name.c_str(),
+ ::testing::get<0>(GetParam()).listener.description.c_str(),
+ ::testing::get<0>(GetParam()).connector.description.c_str());
+ ASSERT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)))
+ << "CAP_NET_RAW is required to use SO_BINDTODEVICE";
+ }
+};
+
+PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
+ switch (family) {
+ case AF_INET:
+ return static_cast<uint16_t>(
+ reinterpret_cast<sockaddr_in const*>(&addr)->sin_port);
+ case AF_INET6:
+ return static_cast<uint16_t>(
+ reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port);
+ default:
+ return PosixError(EINVAL,
+ absl::StrCat("unknown socket family: ", family));
+ }
+}
+
+PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) {
+ switch (family) {
+ case AF_INET:
+ reinterpret_cast<sockaddr_in*>(addr)->sin_port = port;
+ return NoError();
+ case AF_INET6:
+ reinterpret_cast<sockaddr_in6*>(addr)->sin6_port = port;
+ return NoError();
+ default:
+ return PosixError(EINVAL,
+ absl::StrCat("unknown socket family: ", family));
+ }
+}
+
+// Binds sockets to different devices and then creates many TCP connections.
+// Checks that the distribution of connections received on the sockets matches
+// the expectation.
+TEST_P(BindToDeviceDistributionTest, Tcp) {
+ auto const& [listener_connector, test] = GetParam();
+
+ TestAddress const& listener = listener_connector.listener;
+ TestAddress const& connector = listener_connector.connector;
+ sockaddr_storage listen_addr = listener.addr;
+ sockaddr_storage conn_addr = connector.addr;
+
+ auto interface_names = GetInterfaceNames();
+
+ // Create the listening sockets.
+ std::vector<FileDescriptor> listener_fds;
+ std::vector<std::unique_ptr<Tunnel>> all_tunnels;
+ for (auto const& endpoint : test.endpoints) {
+ if (!endpoint.bind_to_device.empty() &&
+ interface_names.find(endpoint.bind_to_device) ==
+ interface_names.end()) {
+ all_tunnels.push_back(
+ ASSERT_NO_ERRNO_AND_VALUE(Tunnel::New(endpoint.bind_to_device)));
+ interface_names.insert(endpoint.bind_to_device);
+ }
+
+ listener_fds.push_back(ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP)));
+ int fd = listener_fds.back().get();
+
+ ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
+ endpoint.bind_to_device.c_str(),
+ endpoint.bind_to_device.size() + 1),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(fd, 40), SyscallSucceeds());
+
+ // On the first bind we need to determine which port was bound.
+ if (listener_fds.size() > 1) {
+ continue;
+ }
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(
+ getsockname(listener_fds[0].get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ }
+
+ constexpr int kConnectAttempts = 10000;
+ std::atomic<int> connects_received = ATOMIC_VAR_INIT(0);
+ std::vector<int> accept_counts(listener_fds.size(), 0);
+ std::vector<std::unique_ptr<ScopedThread>> listen_threads(
+ listener_fds.size());
+
+ for (int i = 0; i < listener_fds.size(); i++) {
+ listen_threads[i] = absl::make_unique<ScopedThread>(
+ [&listener_fds, &accept_counts, &connects_received, i,
+ kConnectAttempts]() {
+ do {
+ auto fd = Accept(listener_fds[i].get(), nullptr, nullptr);
+ if (!fd.ok()) {
+ // Another thread has shutdown our read side causing the accept to
+ // fail.
+ ASSERT_GE(connects_received, kConnectAttempts)
+ << "errno = " << fd.error();
+ return;
+ }
+ // Receive some data from a socket to be sure that the connect()
+ // system call has been completed on another side.
+ // Do a short read and then close the socket to trigger a RST. This
+ // ensures that both ends of the connection are cleaned up and no
+ // goroutines hang around in TIME-WAIT. We do this so that this test
+ // does not timeout under gotsan runs where lots of goroutines can
+ // cause the test to use absurd amounts of memory.
+ //
+ // See: https://tools.ietf.org/html/rfc2525#page-50 section 2.17
+ uint16_t data;
+ EXPECT_THAT(
+ RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
+ SyscallSucceedsWithValue(sizeof(data)));
+ accept_counts[i]++;
+ } while (++connects_received < kConnectAttempts);
+
+ // Shutdown all sockets to wake up other threads.
+ for (auto const& listener_fd : listener_fds) {
+ shutdown(listener_fd.get(), SHUT_RDWR);
+ }
+ });
+ }
+
+ for (int i = 0; i < kConnectAttempts; i++) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+ ASSERT_THAT(
+ RetryEINTR(connect)(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+
+ // Do two separate sends to ensure two segments are received. This is
+ // required for netstack where read is incorrectly assuming a whole
+ // segment is read when endpoint.Read() is called which is technically
+ // incorrect as the syscall that invoked endpoint.Read() may only
+ // consume it partially. This results in a case where a close() of
+ // such a socket does not trigger a RST in netstack due to the
+ // endpoint assuming that the endpoint has no unread data.
+ EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+
+ // TODO(gvisor.dev/issue/1449): Remove this block once netstack correctly
+ // generates a RST.
+ if (IsRunningOnGvisor()) {
+ EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+ }
+ }
+
+ // Join threads to be sure that all connections have been counted.
+ for (auto const& listen_thread : listen_threads) {
+ listen_thread->Join();
+ }
+ // Check that connections are distributed correctly among listening sockets.
+ for (int i = 0; i < accept_counts.size(); i++) {
+ EXPECT_THAT(
+ accept_counts[i],
+ EquivalentWithin(static_cast<int>(kConnectAttempts *
+ test.endpoints[i].expected_ratio),
+ 0.10))
+ << "endpoint " << i << " got the wrong number of packets";
+ }
+}
+
+// Binds sockets to different devices and then sends many UDP packets. Checks
+// that the distribution of packets received on the sockets matches the
+// expectation.
+TEST_P(BindToDeviceDistributionTest, Udp) {
+ auto const& [listener_connector, test] = GetParam();
+
+ TestAddress const& listener = listener_connector.listener;
+ TestAddress const& connector = listener_connector.connector;
+ sockaddr_storage listen_addr = listener.addr;
+ sockaddr_storage conn_addr = connector.addr;
+
+ auto interface_names = GetInterfaceNames();
+
+ // Create the listening socket.
+ std::vector<FileDescriptor> listener_fds;
+ std::vector<std::unique_ptr<Tunnel>> all_tunnels;
+ for (auto const& endpoint : test.endpoints) {
+ if (!endpoint.bind_to_device.empty() &&
+ interface_names.find(endpoint.bind_to_device) ==
+ interface_names.end()) {
+ all_tunnels.push_back(
+ ASSERT_NO_ERRNO_AND_VALUE(Tunnel::New(endpoint.bind_to_device)));
+ interface_names.insert(endpoint.bind_to_device);
+ }
+
+ listener_fds.push_back(
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(listener.family(), SOCK_DGRAM, 0)));
+ int fd = listener_fds.back().get();
+
+ ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
+ endpoint.bind_to_device.c_str(),
+ endpoint.bind_to_device.size() + 1),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
+ SyscallSucceeds());
+
+ // On the first bind we need to determine which port was bound.
+ if (listener_fds.size() > 1) {
+ continue;
+ }
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(
+ getsockname(listener_fds[0].get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+ ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ }
+
+ constexpr int kConnectAttempts = 10000;
+ std::atomic<int> packets_received = ATOMIC_VAR_INIT(0);
+ std::vector<int> packets_per_socket(listener_fds.size(), 0);
+ std::vector<std::unique_ptr<ScopedThread>> receiver_threads(
+ listener_fds.size());
+
+ for (int i = 0; i < listener_fds.size(); i++) {
+ receiver_threads[i] = absl::make_unique<ScopedThread>(
+ [&listener_fds, &packets_per_socket, &packets_received, i]() {
+ do {
+ struct sockaddr_storage addr = {};
+ socklen_t addrlen = sizeof(addr);
+ int data;
+
+ auto ret = RetryEINTR(recvfrom)(
+ listener_fds[i].get(), &data, sizeof(data), 0,
+ reinterpret_cast<struct sockaddr*>(&addr), &addrlen);
+
+ if (packets_received < kConnectAttempts) {
+ ASSERT_THAT(ret, SyscallSucceedsWithValue(sizeof(data)));
+ }
+
+ if (ret != sizeof(data)) {
+ // Another thread may have shutdown our read side causing the
+ // recvfrom to fail.
+ break;
+ }
+
+ packets_received++;
+ packets_per_socket[i]++;
+
+ // A response is required to synchronize with the main thread,
+ // otherwise the main thread can send more than can fit into receive
+ // queues.
+ EXPECT_THAT(RetryEINTR(sendto)(
+ listener_fds[i].get(), &data, sizeof(data), 0,
+ reinterpret_cast<sockaddr*>(&addr), addrlen),
+ SyscallSucceedsWithValue(sizeof(data)));
+ } while (packets_received < kConnectAttempts);
+
+ // Shutdown all sockets to wake up other threads.
+ for (auto const& listener_fd : listener_fds) {
+ shutdown(listener_fd.get(), SHUT_RDWR);
+ }
+ });
+ }
+
+ for (int i = 0; i < kConnectAttempts; i++) {
+ FileDescriptor const fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(connector.family(), SOCK_DGRAM, 0));
+ EXPECT_THAT(RetryEINTR(sendto)(fd.get(), &i, sizeof(i), 0,
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceedsWithValue(sizeof(i)));
+ int data;
+ EXPECT_THAT(RetryEINTR(recv)(fd.get(), &data, sizeof(data), 0),
+ SyscallSucceedsWithValue(sizeof(data)));
+ }
+
+ // Join threads to be sure that all connections have been counted.
+ for (auto const& receiver_thread : receiver_threads) {
+ receiver_thread->Join();
+ }
+ // Check that packets are distributed correctly among listening sockets.
+ for (int i = 0; i < packets_per_socket.size(); i++) {
+ EXPECT_THAT(
+ packets_per_socket[i],
+ EquivalentWithin(static_cast<int>(kConnectAttempts *
+ test.endpoints[i].expected_ratio),
+ 0.10))
+ << "endpoint " << i << " got the wrong number of packets";
+ }
+}
+
+std::vector<DistributionTestCase> GetDistributionTestCases() {
+ return std::vector<DistributionTestCase>{
+ {"Even distribution among sockets not bound to device",
+ {{"", 1. / 3}, {"", 1. / 3}, {"", 1. / 3}}},
+ {"Sockets bound to other interfaces get no packets",
+ {{"eth1", 0}, {"", 1. / 2}, {"", 1. / 2}}},
+ {"Bound has priority over unbound", {{"eth1", 0}, {"", 0}, {"lo", 1}}},
+ {"Even distribution among sockets bound to device",
+ {{"eth1", 0}, {"lo", 1. / 2}, {"lo", 1. / 2}}},
+ };
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ BindToDeviceTest, BindToDeviceDistributionTest,
+ ::testing::Combine(::testing::Values(
+ // Listeners bound to IPv4 addresses refuse
+ // connections using IPv6 addresses.
+ ListenerConnector{V4Any(), V4Loopback()},
+ ListenerConnector{V4Loopback(), V4MappedLoopback()}),
+ ::testing::ValuesIn(GetDistributionTestCases())));
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_bind_to_device_sequence.cc b/test/syscalls/linux/socket_bind_to_device_sequence.cc
new file mode 100644
index 000000000..d3cc71dbf
--- /dev/null
+++ b/test/syscalls/linux/socket_bind_to_device_sequence.cc
@@ -0,0 +1,513 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/capability.h>
+#include <linux/if_tun.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/container/node_hash_map.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_bind_to_device_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+using std::string;
+using std::vector;
+
+// Test fixture for SO_BINDTODEVICE tests the results of sequences of socket
+// binding.
+class BindToDeviceSequenceTest : public ::testing::TestWithParam<SocketKind> {
+ protected:
+ void SetUp() override {
+ printf("Testing case: %s\n", GetParam().description.c_str());
+ ASSERT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)))
+ << "CAP_NET_RAW is required to use SO_BINDTODEVICE";
+ socket_factory_ = GetParam();
+
+ interface_names_ = GetInterfaceNames();
+ }
+
+ PosixErrorOr<std::unique_ptr<FileDescriptor>> NewSocket() const {
+ return socket_factory_.Create();
+ }
+
+ // Gets a device by device_id. If the device_id has been seen before, returns
+ // the previously returned device. If not, finds or creates a new device.
+ // Returns an empty string on failure.
+ void GetDevice(int device_id, string* device_name) {
+ auto device = devices_.find(device_id);
+ if (device != devices_.end()) {
+ *device_name = device->second;
+ return;
+ }
+
+ // Need to pick a new device. Try ethernet first.
+ *device_name = absl::StrCat("eth", next_unused_eth_);
+ if (interface_names_.find(*device_name) != interface_names_.end()) {
+ devices_[device_id] = *device_name;
+ next_unused_eth_++;
+ return;
+ }
+
+ // Need to make a new tunnel device. gVisor tests should have enough
+ // ethernet devices to never reach here.
+ ASSERT_FALSE(IsRunningOnGvisor());
+ // Need a tunnel.
+ tunnels_.push_back(ASSERT_NO_ERRNO_AND_VALUE(Tunnel::New()));
+ devices_[device_id] = tunnels_.back()->GetName();
+ *device_name = devices_[device_id];
+ }
+
+ // Release the socket
+ void ReleaseSocket(int socket_id) {
+ // Close the socket that was made in a previous action. The socket_id
+ // indicates which socket to close based on index into the list of actions.
+ sockets_to_close_.erase(socket_id);
+ }
+
+ // SetDevice changes the bind_to_device option. It does not bind or re-bind.
+ void SetDevice(int socket_id, int device_id) {
+ auto socket_fd = sockets_to_close_[socket_id]->get();
+ string device_name;
+ ASSERT_NO_FATAL_FAILURE(GetDevice(device_id, &device_name));
+ EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE,
+ device_name.c_str(), device_name.size() + 1),
+ SyscallSucceedsWithValue(0));
+ }
+
+ // Bind a socket with the reuse options and bind_to_device options. Checks
+ // that all steps succeed and that the bind command's error matches want.
+ // Sets the socket_id to uniquely identify the socket bound if it is not
+ // nullptr.
+ void BindSocket(bool reuse_port, bool reuse_addr, int device_id = 0,
+ int want = 0, int* socket_id = nullptr) {
+ next_socket_id_++;
+ sockets_to_close_[next_socket_id_] = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket_fd = sockets_to_close_[next_socket_id_]->get();
+ if (socket_id != nullptr) {
+ *socket_id = next_socket_id_;
+ }
+
+ // If reuse_port is indicated, do that.
+ if (reuse_port) {
+ EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+ }
+
+ // If reuse_addr is indicated, do that.
+ if (reuse_addr) {
+ EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+ }
+
+ // If the device is non-zero, bind to that device.
+ if (device_id != 0) {
+ string device_name;
+ ASSERT_NO_FATAL_FAILURE(GetDevice(device_id, &device_name));
+ EXPECT_THAT(setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE,
+ device_name.c_str(), device_name.size() + 1),
+ SyscallSucceedsWithValue(0));
+ char get_device[100];
+ socklen_t get_device_size = 100;
+ EXPECT_THAT(getsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE, get_device,
+ &get_device_size),
+ SyscallSucceedsWithValue(0));
+ }
+
+ struct sockaddr_in addr = {};
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ addr.sin_port = port_;
+ if (want == 0) {
+ ASSERT_THAT(
+ bind(socket_fd, reinterpret_cast<const struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallSucceeds());
+ } else {
+ ASSERT_THAT(
+ bind(socket_fd, reinterpret_cast<const struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallFailsWithErrno(want));
+ }
+
+ if (port_ == 0) {
+ // We don't yet know what port we'll be using so we need to fetch it and
+ // remember it for future commands.
+ socklen_t addr_size = sizeof(addr);
+ ASSERT_THAT(
+ getsockname(socket_fd, reinterpret_cast<struct sockaddr*>(&addr),
+ &addr_size),
+ SyscallSucceeds());
+ port_ = addr.sin_port;
+ }
+ }
+
+ private:
+ SocketKind socket_factory_;
+ // devices maps from the device id in the test case to the name of the device.
+ absl::node_hash_map<int, string> devices_;
+ // These are the tunnels that were created for the test and will be destroyed
+ // by the destructor.
+ vector<std::unique_ptr<Tunnel>> tunnels_;
+ // A list of all interface names before the test started.
+ std::unordered_set<string> interface_names_;
+ // The next ethernet device to use when requested a device.
+ int next_unused_eth_ = 1;
+ // The port for all tests. Originally 0 (any) and later set to the port that
+ // all further commands will use.
+ in_port_t port_ = 0;
+ // sockets_to_close_ is a map from action index to the socket that was
+ // created.
+ absl::node_hash_map<int,
+ std::unique_ptr<gvisor::testing::FileDescriptor>>
+ sockets_to_close_;
+ int next_socket_id_ = 0;
+};
+
+TEST_P(BindToDeviceSequenceTest, BindTwiceWithDeviceFails) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 3));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 3, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindToDevice) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 1));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 2));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindToDeviceAndThenWithoutDevice) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindWithoutDevice) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindWithDevice) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 456, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 789, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindWithReuse) {
+ ASSERT_NO_FATAL_FAILURE(
+ BindSocket(/* reusePort */ true, /* reuse_addr */ false));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false,
+ /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 0));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindingWithReuseAndDevice) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 456));
+ ASSERT_NO_FATAL_FAILURE(
+ BindSocket(/* reuse_port */ true, /* reuse_addr */ false));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 789));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 999, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, MixingReuseAndNotReuseByBindingToDevice) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 456, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 789, 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 999, 0));
+}
+
+TEST_P(BindToDeviceSequenceTest, CannotBindTo0AfterMixingReuseAndNotReuse) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 456));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindAndRelease) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+ int to_release;
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, 0, &to_release));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 345, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 789));
+ // Release the bind to device 0 and try again.
+ ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 345));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindTwiceWithReuseOnce) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindWithReuseAddr) {
+ ASSERT_NO_FATAL_FAILURE(
+ BindSocket(/* reusePort */ false, /* reuse_addr */ true));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 123, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+}
+
+TEST_P(BindToDeviceSequenceTest,
+ CannotBindTo0AfterMixingReuseAddrAndNotReuseAddr) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 123));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 456));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReusePort) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrReusePortThenReuseAddr) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReusePort) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindDoubleReuseAddrReusePortThenReuseAddr) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ true, /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReusePortThenReuseAddrReusePort) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ true, /* reuse_addr */ false, /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest, BindReuseAddrThenReuseAddr) {
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0, EADDRINUSE));
+}
+
+TEST_P(BindToDeviceSequenceTest,
+ BindReuseAddrThenReuseAddrReusePortThenReuseAddr) {
+ // The behavior described in this test seems like a Linux bug. It doesn't
+ // make any sense and it is unlikely that any applications rely on it.
+ //
+ // Both SO_REUSEADDR and SO_REUSEPORT allow binding multiple UDP sockets to
+ // the same address and deliver each packet to exactly one of the bound
+ // sockets. If both are enabled, one of the strategies is selected to route
+ // packets. The strategy is selected dynamically based on the settings of the
+ // currently bound sockets. Usually, the strategy is selected based on the
+ // common setting (SO_REUSEADDR or SO_REUSEPORT) amongst the sockets, but for
+ // some reason, Linux allows binding sets of sockets with no overlapping
+ // settings in some situations. In this case, it is not obvious which strategy
+ // would be selected as the configured setting is a contradiction.
+ SKIP_IF(IsRunningOnGvisor());
+
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ true, /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ true,
+ /* bind_to_device */ 0));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ true,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 0));
+}
+
+// Repro test for gvisor.dev/issue/1217. Not replicated in ports_test.go as this
+// test is different from the others and wouldn't fit well there.
+TEST_P(BindToDeviceSequenceTest, BindAndReleaseDifferentDevice) {
+ int to_release;
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 3, 0, &to_release));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(/* reuse_port */ false,
+ /* reuse_addr */ false,
+ /* bind_to_device */ 3, EADDRINUSE));
+ // Change the device. Since the socket was already bound, this should have no
+ // effect.
+ SetDevice(to_release, 2);
+ // Release the bind to device 3 and try again.
+ ASSERT_NO_FATAL_FAILURE(ReleaseSocket(to_release));
+ ASSERT_NO_FATAL_FAILURE(BindSocket(
+ /* reuse_port */ false, /* reuse_addr */ false, /* bind_to_device */ 3));
+}
+
+INSTANTIATE_TEST_SUITE_P(BindToDeviceTest, BindToDeviceSequenceTest,
+ ::testing::Values(IPv4UDPUnboundSocket(0),
+ IPv4TCPUnboundSocket(0)));
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_bind_to_device_util.cc b/test/syscalls/linux/socket_bind_to_device_util.cc
new file mode 100644
index 000000000..f4ee775bd
--- /dev/null
+++ b/test/syscalls/linux/socket_bind_to_device_util.cc
@@ -0,0 +1,75 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_bind_to_device_util.h"
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <linux/if_tun.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+using std::string;
+
+PosixErrorOr<std::unique_ptr<Tunnel>> Tunnel::New(string tunnel_name) {
+ int fd;
+ RETURN_ERROR_IF_SYSCALL_FAIL(fd = open("/dev/net/tun", O_RDWR));
+
+ // Using `new` to access a non-public constructor.
+ auto new_tunnel = absl::WrapUnique(new Tunnel(fd));
+
+ ifreq ifr = {};
+ ifr.ifr_flags = IFF_TUN;
+ strncpy(ifr.ifr_name, tunnel_name.c_str(), sizeof(ifr.ifr_name));
+
+ RETURN_ERROR_IF_SYSCALL_FAIL(ioctl(fd, TUNSETIFF, &ifr));
+ new_tunnel->name_ = ifr.ifr_name;
+ return new_tunnel;
+}
+
+std::unordered_set<string> GetInterfaceNames() {
+ struct if_nameindex* interfaces = if_nameindex();
+ std::unordered_set<string> names;
+ if (interfaces == nullptr) {
+ return names;
+ }
+ for (auto interface = interfaces;
+ interface->if_index != 0 || interface->if_name != nullptr; interface++) {
+ names.insert(interface->if_name);
+ }
+ if_freenameindex(interfaces);
+ return names;
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_bind_to_device_util.h b/test/syscalls/linux/socket_bind_to_device_util.h
new file mode 100644
index 000000000..f941ccc86
--- /dev/null
+++ b/test/syscalls/linux/socket_bind_to_device_util.h
@@ -0,0 +1,67 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_SOCKET_BIND_TO_DEVICE_UTILS_H_
+#define GVISOR_TEST_SYSCALLS_SOCKET_BIND_TO_DEVICE_UTILS_H_
+
+#include <arpa/inet.h>
+#include <linux/if_tun.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+class Tunnel {
+ public:
+ static PosixErrorOr<std::unique_ptr<Tunnel>> New(
+ std::string tunnel_name = "");
+ const std::string& GetName() const { return name_; }
+
+ ~Tunnel() {
+ if (fd_ != -1) {
+ close(fd_);
+ }
+ }
+
+ private:
+ Tunnel(int fd) : fd_(fd) {}
+ int fd_ = -1;
+ std::string name_;
+};
+
+std::unordered_set<std::string> GetInterfaceNames();
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_SOCKET_BIND_TO_DEVICE_UTILS_H_
diff --git a/test/syscalls/linux/socket_blocking.cc b/test/syscalls/linux/socket_blocking.cc
new file mode 100644
index 000000000..7e88aa2d9
--- /dev/null
+++ b/test/syscalls/linux/socket_blocking.cc
@@ -0,0 +1,60 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_blocking.h"
+
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <cstdio>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/timer_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(BlockingSocketPairTest, RecvBlocks) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[100];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ constexpr auto kDuration = absl::Milliseconds(200);
+ auto before = Now(CLOCK_MONOTONIC);
+
+ const ScopedThread t([&]() {
+ absl::SleepFor(kDuration);
+ ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ });
+
+ char received_data[sizeof(sent_data)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ auto after = Now(CLOCK_MONOTONIC);
+ EXPECT_GE(after - before, kDuration);
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_blocking.h b/test/syscalls/linux/socket_blocking.h
new file mode 100644
index 000000000..db26e5ef5
--- /dev/null
+++ b/test/syscalls/linux/socket_blocking.h
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_BLOCKING_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_BLOCKING_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of blocking connected sockets.
+using BlockingSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_BLOCKING_H_
diff --git a/test/syscalls/linux/socket_capability.cc b/test/syscalls/linux/socket_capability.cc
new file mode 100644
index 000000000..84b5b2b21
--- /dev/null
+++ b/test/syscalls/linux/socket_capability.cc
@@ -0,0 +1,61 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Subset of socket tests that need Linux-specific headers (compared to POSIX
+// headers).
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST(SocketTest, UnixConnectNeedsWritePerm) {
+ SKIP_IF(IsRunningWithVFS1());
+
+ FileDescriptor bound =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
+
+ struct sockaddr_un addr =
+ ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(/*abstract=*/false, AF_UNIX));
+ ASSERT_THAT(bind(bound.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(bound.get(), 1), SyscallSucceeds());
+
+ // Drop capabilites that allow us to override permision checks. Otherwise if
+ // the test is run as root, the connect below will bypass permission checks
+ // and succeed unexpectedly.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+
+ // Connect should fail without write perms.
+ ASSERT_THAT(chmod(addr.sun_path, 0500), SyscallSucceeds());
+ FileDescriptor client =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX));
+ ASSERT_THAT(connect(client.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallFailsWithErrno(EACCES));
+
+ // Connect should succeed with write perms.
+ ASSERT_THAT(chmod(addr.sun_path, 0200), SyscallSucceeds());
+ EXPECT_THAT(connect(client.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallSucceeds());
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_filesystem.cc b/test/syscalls/linux/socket_filesystem.cc
new file mode 100644
index 000000000..287359363
--- /dev/null
+++ b/test/syscalls/linux/socket_filesystem.cc
@@ -0,0 +1,49 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_generic.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/socket_unix.h"
+#include "test/syscalls/linux/socket_unix_cmsg.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return ApplyVec<SocketPairKind>(
+ FilesystemBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK}));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ FilesystemUnixSockets, AllSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ FilesystemUnixSockets, UnixSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ FilesystemUnixSockets, UnixSocketPairCmsgTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_generic.cc b/test/syscalls/linux/socket_generic.cc
new file mode 100644
index 000000000..f7d6139f1
--- /dev/null
+++ b/test/syscalls/linux/socket_generic.cc
@@ -0,0 +1,820 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_generic.h"
+
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+// This file is a generic socket test file. It must be built with another file
+// that provides the test types.
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(AllSocketPairTest, BasicReadWrite) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char buf[20];
+ const std::string data = "abc";
+ ASSERT_THAT(WriteFd(sockets->first_fd(), data.c_str(), 3),
+ SyscallSucceedsWithValue(3));
+ ASSERT_THAT(ReadFd(sockets->second_fd(), buf, 3),
+ SyscallSucceedsWithValue(3));
+ EXPECT_EQ(data, absl::string_view(buf, 3));
+}
+
+TEST_P(AllSocketPairTest, BasicSendRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ char received_data[sizeof(sent_data)];
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(AllSocketPairTest, BasicSendmmsg) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data[200];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ std::vector<struct mmsghdr> msgs(10);
+ std::vector<struct iovec> iovs(msgs.size());
+ const int chunk_size = sizeof(sent_data) / msgs.size();
+ for (size_t i = 0; i < msgs.size(); i++) {
+ iovs[i].iov_len = chunk_size;
+ iovs[i].iov_base = &sent_data[i * chunk_size];
+ msgs[i].msg_hdr.msg_iov = &iovs[i];
+ msgs[i].msg_hdr.msg_iovlen = 1;
+ }
+
+ ASSERT_THAT(
+ RetryEINTR(sendmmsg)(sockets->first_fd(), &msgs[0], msgs.size(), 0),
+ SyscallSucceedsWithValue(msgs.size()));
+
+ for (const struct mmsghdr& msg : msgs) {
+ EXPECT_EQ(chunk_size, msg.msg_len);
+ }
+
+ char received_data[sizeof(sent_data)];
+ for (size_t i = 0; i < msgs.size(); i++) {
+ ASSERT_THAT(ReadFd(sockets->second_fd(), &received_data[i * chunk_size],
+ chunk_size),
+ SyscallSucceedsWithValue(chunk_size));
+ }
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(AllSocketPairTest, BasicRecvmmsg) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data[200];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ char received_data[sizeof(sent_data)];
+ std::vector<struct mmsghdr> msgs(10);
+ std::vector<struct iovec> iovs(msgs.size());
+ const int chunk_size = sizeof(sent_data) / msgs.size();
+ for (size_t i = 0; i < msgs.size(); i++) {
+ iovs[i].iov_len = chunk_size;
+ iovs[i].iov_base = &received_data[i * chunk_size];
+ msgs[i].msg_hdr.msg_iov = &iovs[i];
+ msgs[i].msg_hdr.msg_iovlen = 1;
+ }
+
+ for (size_t i = 0; i < msgs.size(); i++) {
+ ASSERT_THAT(
+ WriteFd(sockets->first_fd(), &sent_data[i * chunk_size], chunk_size),
+ SyscallSucceedsWithValue(chunk_size));
+ }
+
+ ASSERT_THAT(RetryEINTR(recvmmsg)(sockets->second_fd(), &msgs[0], msgs.size(),
+ 0, nullptr),
+ SyscallSucceedsWithValue(msgs.size()));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ for (const struct mmsghdr& msg : msgs) {
+ EXPECT_EQ(chunk_size, msg.msg_len);
+ }
+}
+
+TEST_P(AllSocketPairTest, SendmsgRecvmsg10KB) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ std::vector<char> sent_data(10 * 1024);
+ RandomizeBuffer(sent_data.data(), sent_data.size());
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data.data(), sent_data.size()));
+
+ std::vector<char> received_data(sent_data.size());
+ ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(sockets->second_fd(), received_data.data(),
+ received_data.size()));
+
+ EXPECT_EQ(0,
+ memcmp(sent_data.data(), received_data.data(), sent_data.size()));
+}
+
+// This test validates that a sendmsg/recvmsg w/ MSG_CTRUNC is a no-op on
+// input flags.
+TEST_P(AllSocketPairTest, SendmsgRecvmsgMsgCtruncNoop) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ std::vector<char> sent_data(10 * 1024);
+ RandomizeBuffer(sent_data.data(), sent_data.size());
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data.data(), sent_data.size()));
+
+ std::vector<char> received_data(sent_data.size());
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(int)) + CMSG_SPACE(sizeof(struct ucred))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct iovec iov;
+ iov.iov_base = &received_data[0];
+ iov.iov_len = received_data.size();
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ // MSG_CTRUNC should be a no-op.
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CTRUNC),
+ SyscallSucceedsWithValue(received_data.size()));
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ EXPECT_EQ(cmsg, nullptr);
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(0,
+ memcmp(sent_data.data(), received_data.data(), sent_data.size()));
+}
+
+TEST_P(AllSocketPairTest, SendmsgRecvmsg16KB) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ std::vector<char> sent_data(16 * 1024);
+ RandomizeBuffer(sent_data.data(), sent_data.size());
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data.data(), sent_data.size()));
+
+ std::vector<char> received_data(sent_data.size());
+ ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(sockets->second_fd(), received_data.data(),
+ received_data.size()));
+
+ EXPECT_EQ(0,
+ memcmp(sent_data.data(), received_data.data(), sent_data.size()));
+}
+
+TEST_P(AllSocketPairTest, RecvmsgMsghdrFlagsNotClearedOnFailure) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char received_data[10] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+
+ // Check that msghdr flags were not changed.
+ EXPECT_EQ(msg.msg_flags, -1);
+}
+
+TEST_P(AllSocketPairTest, RecvmsgMsghdrFlagsCleared) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[sizeof(sent_data)] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(sent_data)));
+
+ // Check that msghdr flags were cleared.
+ EXPECT_EQ(msg.msg_flags, 0);
+}
+
+TEST_P(AllSocketPairTest, RecvmsgPeekMsghdrFlagsCleared) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[sizeof(sent_data)] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_PEEK),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(sent_data)));
+
+ // Check that msghdr flags were cleared.
+ EXPECT_EQ(msg.msg_flags, 0);
+}
+
+TEST_P(AllSocketPairTest, RecvmsgIovNotUpdated) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[sizeof(sent_data) * 2] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(sent_data)));
+
+ // Check that the iovec length was not updated.
+ EXPECT_EQ(msg.msg_iov->iov_len, sizeof(received_data));
+}
+
+TEST_P(AllSocketPairTest, RecvmmsgInvalidTimeout) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char buf[10];
+ struct mmsghdr msg = {};
+ struct iovec iov = {};
+ iov.iov_len = sizeof(buf);
+ iov.iov_base = buf;
+ msg.msg_hdr.msg_iov = &iov;
+ msg.msg_hdr.msg_iovlen = 1;
+ struct timespec timeout = {-1, -1};
+ ASSERT_THAT(RetryEINTR(recvmmsg)(sockets->first_fd(), &msg, 1, 0, &timeout),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(AllSocketPairTest, RecvmmsgTimeoutBeforeRecv) {
+ // There is a known bug in the Linux recvmmsg(2) causing it to block forever
+ // if the timeout expires while blocking for the first message.
+ SKIP_IF(!IsRunningOnGvisor());
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char buf[10];
+ struct mmsghdr msg = {};
+ struct iovec iov = {};
+ iov.iov_len = sizeof(buf);
+ iov.iov_base = buf;
+ msg.msg_hdr.msg_iov = &iov;
+ msg.msg_hdr.msg_iovlen = 1;
+ struct timespec timeout = {};
+ ASSERT_THAT(RetryEINTR(recvmmsg)(sockets->first_fd(), &msg, 1, 0, &timeout),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(AllSocketPairTest, MsgPeek) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data[50];
+ memset(&sent_data, 0, sizeof(sent_data));
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[sizeof(sent_data)];
+ for (int i = 0; i < 3; i++) {
+ memset(received_data, 0, sizeof(received_data));
+ EXPECT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), MSG_PEEK),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(received_data)));
+ }
+
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(received_data)));
+}
+
+TEST_P(AllSocketPairTest, LingerSocketOption) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ struct linger got_linger = {-1, -1};
+ socklen_t length = sizeof(struct linger);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_LINGER,
+ &got_linger, &length),
+ SyscallSucceedsWithValue(0));
+ struct linger want_linger = {};
+ EXPECT_EQ(0, memcmp(&want_linger, &got_linger, sizeof(struct linger)));
+ EXPECT_EQ(sizeof(struct linger), length);
+}
+
+TEST_P(AllSocketPairTest, KeepAliveSocketOption) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ int keepalive = -1;
+ socklen_t length = sizeof(int);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_KEEPALIVE,
+ &keepalive, &length),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, keepalive);
+ EXPECT_EQ(sizeof(int), length);
+}
+
+TEST_P(AllSocketPairTest, RcvBufSucceeds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ int size = 0;
+ socklen_t size_size = sizeof(size);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &size, &size_size),
+ SyscallSucceeds());
+ EXPECT_GT(size, 0);
+}
+
+TEST_P(AllSocketPairTest, SndBufSucceeds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ int size = 0;
+ socklen_t size_size = sizeof(size);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, &size, &size_size),
+ SyscallSucceeds());
+ EXPECT_GT(size, 0);
+}
+
+TEST_P(AllSocketPairTest, RecvTimeoutReadSucceeds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = 10
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ char buf[20] = {};
+ EXPECT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(AllSocketPairTest, RecvTimeoutRecvSucceeds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = 10
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ char buf[20] = {};
+ EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), buf, sizeof(buf), 0),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(AllSocketPairTest, RecvTimeoutRecvOneSecondSucceeds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 1, .tv_usec = 0
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ char buf[20] = {};
+ EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), buf, sizeof(buf), 0),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(AllSocketPairTest, RecvTimeoutRecvmsgSucceeds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = 10
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ struct msghdr msg = {};
+ char buf[20] = {};
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = sizeof(buf);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ EXPECT_THAT(RetryEINTR(recvmsg)(sockets->first_fd(), &msg, 0),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(AllSocketPairTest, SendTimeoutDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ timeval actual_tv = {.tv_sec = -1, .tv_usec = -1};
+ socklen_t len = sizeof(actual_tv);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+ &actual_tv, &len),
+ SyscallSucceeds());
+ EXPECT_EQ(actual_tv.tv_sec, 0);
+ EXPECT_EQ(actual_tv.tv_usec, 0);
+}
+
+TEST_P(AllSocketPairTest, SetGetSendTimeout) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ timeval tv = {.tv_sec = 89, .tv_usec = 42000};
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ timeval actual_tv = {};
+ socklen_t len = sizeof(actual_tv);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+ &actual_tv, &len),
+ SyscallSucceeds());
+ EXPECT_EQ(actual_tv.tv_sec, 89);
+ EXPECT_EQ(actual_tv.tv_usec, 42000);
+}
+
+TEST_P(AllSocketPairTest, SetGetSendTimeoutLargerArg) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval_with_extra {
+ struct timeval tv;
+ int64_t extra_data;
+ } ABSL_ATTRIBUTE_PACKED;
+
+ timeval_with_extra tv_extra = {
+ .tv = {.tv_sec = 0, .tv_usec = 123000},
+ };
+
+ EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+ &tv_extra, sizeof(tv_extra)),
+ SyscallSucceeds());
+
+ timeval_with_extra actual_tv = {};
+ socklen_t len = sizeof(actual_tv);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO,
+ &actual_tv, &len),
+ SyscallSucceeds());
+ EXPECT_EQ(actual_tv.tv.tv_sec, 0);
+ EXPECT_EQ(actual_tv.tv.tv_usec, 123000);
+}
+
+TEST_P(AllSocketPairTest, SendTimeoutAllowsWrite) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = 10
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ char buf[20] = {};
+ ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+}
+
+TEST_P(AllSocketPairTest, SendTimeoutAllowsSend) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = 10
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ char buf[20] = {};
+ ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(buf)));
+}
+
+TEST_P(AllSocketPairTest, SendTimeoutAllowsSendmsg) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = 10
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ char buf[20] = {};
+ ASSERT_NO_FATAL_FAILURE(SendNullCmsg(sockets->first_fd(), buf, sizeof(buf)));
+}
+
+TEST_P(AllSocketPairTest, RecvTimeoutDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ timeval actual_tv = {.tv_sec = -1, .tv_usec = -1};
+ socklen_t len = sizeof(actual_tv);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+ &actual_tv, &len),
+ SyscallSucceeds());
+ EXPECT_EQ(actual_tv.tv_sec, 0);
+ EXPECT_EQ(actual_tv.tv_usec, 0);
+}
+
+TEST_P(AllSocketPairTest, SetGetRecvTimeout) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ timeval tv = {.tv_sec = 123, .tv_usec = 456000};
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ timeval actual_tv = {};
+ socklen_t len = sizeof(actual_tv);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+ &actual_tv, &len),
+ SyscallSucceeds());
+ EXPECT_EQ(actual_tv.tv_sec, 123);
+ EXPECT_EQ(actual_tv.tv_usec, 456000);
+}
+
+TEST_P(AllSocketPairTest, SetGetRecvTimeoutLargerArg) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval_with_extra {
+ struct timeval tv;
+ int64_t extra_data;
+ } ABSL_ATTRIBUTE_PACKED;
+
+ timeval_with_extra tv_extra = {
+ .tv = {.tv_sec = 0, .tv_usec = 432000},
+ };
+
+ EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+ &tv_extra, sizeof(tv_extra)),
+ SyscallSucceeds());
+
+ timeval_with_extra actual_tv = {};
+ socklen_t len = sizeof(actual_tv);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO,
+ &actual_tv, &len),
+ SyscallSucceeds());
+ EXPECT_EQ(actual_tv.tv.tv_sec, 0);
+ EXPECT_EQ(actual_tv.tv.tv_usec, 432000);
+}
+
+TEST_P(AllSocketPairTest, RecvTimeoutRecvmsgOneSecondSucceeds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 1, .tv_usec = 0
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ struct msghdr msg = {};
+ char buf[20] = {};
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = sizeof(buf);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ EXPECT_THAT(RetryEINTR(recvmsg)(sockets->first_fd(), &msg, 0),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(AllSocketPairTest, RecvTimeoutUsecTooLarge) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = 2000000 // 2 seconds.
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
+ SyscallFailsWithErrno(EDOM));
+}
+
+TEST_P(AllSocketPairTest, SendTimeoutUsecTooLarge) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = 2000000 // 2 seconds.
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)),
+ SyscallFailsWithErrno(EDOM));
+}
+
+TEST_P(AllSocketPairTest, RecvTimeoutUsecNeg) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = -1
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
+ SyscallFailsWithErrno(EDOM));
+}
+
+TEST_P(AllSocketPairTest, SendTimeoutUsecNeg) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = -1
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)),
+ SyscallFailsWithErrno(EDOM));
+}
+
+TEST_P(AllSocketPairTest, RecvTimeoutNegSecRead) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = -1, .tv_usec = 0
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ char buf[20] = {};
+ EXPECT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(AllSocketPairTest, RecvTimeoutNegSecRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = -1, .tv_usec = 0
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ char buf[20] = {};
+ EXPECT_THAT(RetryEINTR(recv)(sockets->first_fd(), buf, sizeof(buf), 0),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(AllSocketPairTest, RecvTimeoutNegSecRecvmsg) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = -1, .tv_usec = 0
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ struct msghdr msg = {};
+ char buf[20] = {};
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = sizeof(buf);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ EXPECT_THAT(RetryEINTR(recvmsg)(sockets->first_fd(), &msg, 0),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(AllSocketPairTest, RecvWaitAll) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[100];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[sizeof(sent_data)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), MSG_WAITALL),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(AllSocketPairTest, RecvWaitAllDontWait) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char data[100] = {};
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), data, sizeof(data),
+ MSG_WAITALL | MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(AllSocketPairTest, RecvTimeoutWaitAll) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = 200000 // 200ms
+ };
+ EXPECT_THAT(setsockopt(sockets->second_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv,
+ sizeof(tv)),
+ SyscallSucceeds());
+
+ char sent_data[100];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[sizeof(sent_data) * 2] = {};
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), MSG_WAITALL),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(AllSocketPairTest, GetSockoptType) {
+ int type = GetParam().type;
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ for (const int fd : {sockets->first_fd(), sockets->second_fd()}) {
+ int opt;
+ socklen_t optlen = sizeof(opt);
+ EXPECT_THAT(getsockopt(fd, SOL_SOCKET, SO_TYPE, &opt, &optlen),
+ SyscallSucceeds());
+
+ // Type may have SOCK_NONBLOCK and SOCK_CLOEXEC ORed into it. Remove these
+ // before comparison.
+ type &= ~(SOCK_NONBLOCK | SOCK_CLOEXEC);
+ EXPECT_EQ(opt, type) << absl::StrFormat(
+ "getsockopt(%d, SOL_SOCKET, SO_TYPE, &opt, &optlen) => opt=%d was "
+ "unexpected",
+ fd, opt);
+ }
+}
+
+TEST_P(AllSocketPairTest, GetSockoptDomain) {
+ const int domain = GetParam().domain;
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ for (const int fd : {sockets->first_fd(), sockets->second_fd()}) {
+ int opt;
+ socklen_t optlen = sizeof(opt);
+ EXPECT_THAT(getsockopt(fd, SOL_SOCKET, SO_DOMAIN, &opt, &optlen),
+ SyscallSucceeds());
+ EXPECT_EQ(opt, domain) << absl::StrFormat(
+ "getsockopt(%d, SOL_SOCKET, SO_DOMAIN, &opt, &optlen) => opt=%d was "
+ "unexpected",
+ fd, opt);
+ }
+}
+
+TEST_P(AllSocketPairTest, GetSockoptProtocol) {
+ const int protocol = GetParam().protocol;
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ for (const int fd : {sockets->first_fd(), sockets->second_fd()}) {
+ int opt;
+ socklen_t optlen = sizeof(opt);
+ EXPECT_THAT(getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &opt, &optlen),
+ SyscallSucceeds());
+ EXPECT_EQ(opt, protocol) << absl::StrFormat(
+ "getsockopt(%d, SOL_SOCKET, SO_PROTOCOL, &opt, &optlen) => opt=%d was "
+ "unexpected",
+ fd, opt);
+ }
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_generic.h b/test/syscalls/linux/socket_generic.h
new file mode 100644
index 000000000..00ae7bfc3
--- /dev/null
+++ b/test/syscalls/linux/socket_generic.h
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_GENERIC_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_GENERIC_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of blocking and non-blocking
+// connected stream sockets.
+using AllSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_GENERIC_H_
diff --git a/test/syscalls/linux/socket_generic_stress.cc b/test/syscalls/linux/socket_generic_stress.cc
new file mode 100644
index 000000000..6a232238d
--- /dev/null
+++ b/test/syscalls/linux/socket_generic_stress.cc
@@ -0,0 +1,83 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected sockets.
+using ConnectStressTest = SocketPairTest;
+
+TEST_P(ConnectStressTest, Reset65kTimes) {
+ for (int i = 0; i < 1 << 16; ++i) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // Send some data to ensure that the connection gets reset and the port gets
+ // released immediately. This avoids either end entering TIME-WAIT.
+ char sent_data[100] = {};
+ ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllConnectedSockets, ConnectStressTest,
+ ::testing::Values(IPv6UDPBidirectionalBindSocketPair(0),
+ IPv4UDPBidirectionalBindSocketPair(0),
+ DualStackUDPBidirectionalBindSocketPair(0),
+
+ // Without REUSEADDR, we get port exhaustion on Linux.
+ SetSockOpt(SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn)(IPv6TCPAcceptBindSocketPair(0)),
+ SetSockOpt(SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn)(IPv4TCPAcceptBindSocketPair(0)),
+ SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+ DualStackTCPAcceptBindSocketPair(0))));
+
+// Test fixture for tests that apply to pairs of connected sockets created with
+// a persistent listener (if applicable).
+using PersistentListenerConnectStressTest = SocketPairTest;
+
+TEST_P(PersistentListenerConnectStressTest, 65kTimes) {
+ for (int i = 0; i < 1 << 16; ++i) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllConnectedSockets, PersistentListenerConnectStressTest,
+ ::testing::Values(
+ IPv6UDPBidirectionalBindSocketPair(0),
+ IPv4UDPBidirectionalBindSocketPair(0),
+ DualStackUDPBidirectionalBindSocketPair(0),
+
+ // Without REUSEADDR, we get port exhaustion on Linux.
+ SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+ IPv6TCPAcceptBindPersistentListenerSocketPair(0)),
+ SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+ IPv4TCPAcceptBindPersistentListenerSocketPair(0)),
+ SetSockOpt(SOL_SOCKET, SO_REUSEADDR, &kSockOptOn)(
+ DualStackTCPAcceptBindPersistentListenerSocketPair(0))));
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
new file mode 100644
index 000000000..18b9e4b70
--- /dev/null
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -0,0 +1,2566 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <string.h>
+#include <sys/socket.h>
+
+#include <atomic>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/save_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using ::testing::Gt;
+
+PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
+ switch (family) {
+ case AF_INET:
+ return static_cast<uint16_t>(
+ reinterpret_cast<sockaddr_in const*>(&addr)->sin_port);
+ case AF_INET6:
+ return static_cast<uint16_t>(
+ reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port);
+ default:
+ return PosixError(EINVAL,
+ absl::StrCat("unknown socket family: ", family));
+ }
+}
+
+PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) {
+ switch (family) {
+ case AF_INET:
+ reinterpret_cast<sockaddr_in*>(addr)->sin_port = port;
+ return NoError();
+ case AF_INET6:
+ reinterpret_cast<sockaddr_in6*>(addr)->sin6_port = port;
+ return NoError();
+ default:
+ return PosixError(EINVAL,
+ absl::StrCat("unknown socket family: ", family));
+ }
+}
+
+struct TestParam {
+ TestAddress listener;
+ TestAddress connector;
+};
+
+std::string DescribeTestParam(::testing::TestParamInfo<TestParam> const& info) {
+ return absl::StrCat("Listen", info.param.listener.description, "_Connect",
+ info.param.connector.description);
+}
+
+using SocketInetLoopbackTest = ::testing::TestWithParam<TestParam>;
+
+TEST(BadSocketPairArgs, ValidateErrForBadCallsToSocketPair) {
+ int fd[2] = {};
+
+ // Valid AF but invalid for socketpair(2) return ESOCKTNOSUPPORT.
+ ASSERT_THAT(socketpair(AF_INET, 0, 0, fd),
+ SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+ ASSERT_THAT(socketpair(AF_INET6, 0, 0, fd),
+ SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+
+ // Invalid AF will return ENOAFSUPPORT.
+ ASSERT_THAT(socketpair(AF_MAX, 0, 0, fd),
+ SyscallFailsWithErrno(EAFNOSUPPORT));
+ ASSERT_THAT(socketpair(8675309, 0, 0, fd),
+ SyscallFailsWithErrno(EAFNOSUPPORT));
+}
+
+enum class Operation {
+ Bind,
+ Connect,
+ SendTo,
+};
+
+std::string OperationToString(Operation operation) {
+ switch (operation) {
+ case Operation::Bind:
+ return "Bind";
+ case Operation::Connect:
+ return "Connect";
+ case Operation::SendTo:
+ return "SendTo";
+ }
+}
+
+using OperationSequence = std::vector<Operation>;
+
+using DualStackSocketTest =
+ ::testing::TestWithParam<std::tuple<TestAddress, OperationSequence>>;
+
+TEST_P(DualStackSocketTest, AddressOperations) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET6, SOCK_DGRAM, 0));
+
+ const TestAddress& addr = std::get<0>(GetParam());
+ const OperationSequence& operations = std::get<1>(GetParam());
+
+ auto addr_in = reinterpret_cast<const sockaddr*>(&addr.addr);
+
+ // sockets may only be bound once. Both `connect` and `sendto` cause a socket
+ // to be bound.
+ bool bound = false;
+ for (const Operation& operation : operations) {
+ bool sockname = false;
+ bool peername = false;
+ switch (operation) {
+ case Operation::Bind: {
+ ASSERT_NO_ERRNO(SetAddrPort(
+ addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 0));
+
+ int bind_ret = bind(fd.get(), addr_in, addr.addr_len);
+
+ // Dual stack sockets may only be bound to AF_INET6.
+ if (!bound && addr.family() == AF_INET6) {
+ EXPECT_THAT(bind_ret, SyscallSucceeds());
+ bound = true;
+
+ sockname = true;
+ } else {
+ EXPECT_THAT(bind_ret, SyscallFailsWithErrno(EINVAL));
+ }
+ break;
+ }
+ case Operation::Connect: {
+ ASSERT_NO_ERRNO(SetAddrPort(
+ addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 1337));
+
+ EXPECT_THAT(RetryEINTR(connect)(fd.get(), addr_in, addr.addr_len),
+ SyscallSucceeds())
+ << GetAddrStr(addr_in);
+ bound = true;
+
+ sockname = true;
+ peername = true;
+
+ break;
+ }
+ case Operation::SendTo: {
+ const char payload[] = "hello";
+ ASSERT_NO_ERRNO(SetAddrPort(
+ addr.family(), const_cast<sockaddr_storage*>(&addr.addr), 1337));
+
+ ssize_t sendto_ret = sendto(fd.get(), &payload, sizeof(payload), 0,
+ addr_in, addr.addr_len);
+
+ EXPECT_THAT(sendto_ret, SyscallSucceedsWithValue(sizeof(payload)));
+ sockname = !bound;
+ bound = true;
+ break;
+ }
+ }
+
+ if (sockname) {
+ sockaddr_storage sock_addr;
+ socklen_t addrlen = sizeof(sock_addr);
+ ASSERT_THAT(getsockname(fd.get(), reinterpret_cast<sockaddr*>(&sock_addr),
+ &addrlen),
+ SyscallSucceeds());
+ ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6));
+
+ auto sock_addr_in6 = reinterpret_cast<const sockaddr_in6*>(&sock_addr);
+
+ if (operation == Operation::SendTo) {
+ EXPECT_EQ(sock_addr_in6->sin6_family, AF_INET6);
+ EXPECT_TRUE(IN6_IS_ADDR_UNSPECIFIED(sock_addr_in6->sin6_addr.s6_addr32))
+ << OperationToString(operation) << " getsocknam="
+ << GetAddrStr(reinterpret_cast<sockaddr*>(&sock_addr));
+
+ EXPECT_NE(sock_addr_in6->sin6_port, 0);
+ } else if (IN6_IS_ADDR_V4MAPPED(
+ reinterpret_cast<const sockaddr_in6*>(addr_in)
+ ->sin6_addr.s6_addr32)) {
+ EXPECT_TRUE(IN6_IS_ADDR_V4MAPPED(sock_addr_in6->sin6_addr.s6_addr32))
+ << OperationToString(operation) << " getsocknam="
+ << GetAddrStr(reinterpret_cast<sockaddr*>(&sock_addr));
+ }
+ }
+
+ if (peername) {
+ sockaddr_storage peer_addr;
+ socklen_t addrlen = sizeof(peer_addr);
+ ASSERT_THAT(getpeername(fd.get(), reinterpret_cast<sockaddr*>(&peer_addr),
+ &addrlen),
+ SyscallSucceeds());
+ ASSERT_EQ(addrlen, sizeof(struct sockaddr_in6));
+
+ if (addr.family() == AF_INET ||
+ IN6_IS_ADDR_V4MAPPED(reinterpret_cast<const sockaddr_in6*>(addr_in)
+ ->sin6_addr.s6_addr32)) {
+ EXPECT_TRUE(IN6_IS_ADDR_V4MAPPED(
+ reinterpret_cast<const sockaddr_in6*>(&peer_addr)
+ ->sin6_addr.s6_addr32))
+ << OperationToString(operation) << " getpeername="
+ << GetAddrStr(reinterpret_cast<sockaddr*>(&peer_addr));
+ }
+ }
+ }
+}
+
+// TODO(gvisor.dev/issue/1556): uncomment V4MappedAny.
+INSTANTIATE_TEST_SUITE_P(
+ All, DualStackSocketTest,
+ ::testing::Combine(
+ ::testing::Values(V4Any(), V4Loopback(), /*V4MappedAny(),*/
+ V4MappedLoopback(), V6Any(), V6Loopback()),
+ ::testing::ValuesIn<OperationSequence>(
+ {{Operation::Bind, Operation::Connect, Operation::SendTo},
+ {Operation::Bind, Operation::SendTo, Operation::Connect},
+ {Operation::Connect, Operation::Bind, Operation::SendTo},
+ {Operation::Connect, Operation::SendTo, Operation::Bind},
+ {Operation::SendTo, Operation::Bind, Operation::Connect},
+ {Operation::SendTo, Operation::Connect, Operation::Bind}})),
+ [](::testing::TestParamInfo<
+ std::tuple<TestAddress, OperationSequence>> const& info) {
+ const TestAddress& addr = std::get<0>(info.param);
+ const OperationSequence& operations = std::get<1>(info.param);
+ std::string s = addr.description;
+ for (const Operation& operation : operations) {
+ absl::StrAppend(&s, OperationToString(operation));
+ }
+ return s;
+ });
+
+void tcpSimpleConnectTest(TestAddress const& listener,
+ TestAddress const& connector, bool unbound) {
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ if (!unbound) {
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ }
+ ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ // Connect to the listening socket.
+ const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+
+ // Accept the connection.
+ //
+ // We have to assign a name to the accepted socket, as unamed temporary
+ // objects are destructed upon full evaluation of the expression it is in,
+ // potentially causing the connecting socket to fail to shutdown properly.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+ ASSERT_THAT(shutdown(listen_fd.get(), SHUT_RDWR), SyscallSucceeds());
+
+ ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds());
+}
+
+TEST_P(SocketInetLoopbackTest, TCP) {
+ auto const& param = GetParam();
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ tcpSimpleConnectTest(listener, connector, true);
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenUnbound) {
+ auto const& param = GetParam();
+
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ tcpSimpleConnectTest(listener, connector, false);
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenShutdownListen) {
+ const auto& param = GetParam();
+
+ const TestAddress& listener = param.listener;
+ const TestAddress& connector = param.connector;
+
+ constexpr int kBacklog = 5;
+
+ // Create the listening socket.
+ FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+ ASSERT_THAT(shutdown(listen_fd.get(), SHUT_RD), SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ const uint16_t port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+
+ for (int i = 0; i < kBacklog; i++) {
+ auto client = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+ ASSERT_THAT(RetryEINTR(connect)(client.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+ }
+ for (int i = 0; i < kBacklog; i++) {
+ ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr), SyscallSucceeds());
+ }
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenShutdown) {
+ auto const& param = GetParam();
+
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ constexpr int kBacklog = 2;
+ constexpr int kFDs = kBacklog + 1;
+
+ // Create the listening socket.
+ FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+
+ // Shutdown the write of the listener, expect to not have any effect.
+ ASSERT_THAT(shutdown(listen_fd.get(), SHUT_WR), SyscallSucceeds());
+
+ for (int i = 0; i < kFDs; i++) {
+ auto client = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+ ASSERT_THAT(RetryEINTR(connect)(client.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr), SyscallSucceeds());
+ }
+
+ // Shutdown the read of the listener, expect to fail subsequent
+ // server accepts, binds and client connects.
+ ASSERT_THAT(shutdown(listen_fd.get(), SHUT_RD), SyscallSucceeds());
+
+ ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Check that shutdown did not release the port.
+ FileDescriptor new_listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ ASSERT_THAT(
+ bind(new_listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Check that subsequent connection attempts receive a RST.
+ auto client = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+ for (int i = 0; i < kFDs; i++) {
+ auto client = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+ ASSERT_THAT(RetryEINTR(connect)(client.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallFailsWithErrno(ECONNREFUSED));
+ }
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenClose) {
+ auto const& param = GetParam();
+
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ constexpr int kAcceptCount = 2;
+ constexpr int kBacklog = kAcceptCount + 2;
+ constexpr int kFDs = kBacklog * 3;
+
+ // Create the listening socket.
+ FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ std::vector<FileDescriptor> clients;
+ for (int i = 0; i < kFDs; i++) {
+ auto client = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+ int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len);
+ if (ret != 0) {
+ EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
+ }
+ clients.push_back(std::move(client));
+ }
+ for (int i = 0; i < kAcceptCount; i++) {
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+ }
+}
+
+void TestListenWhileConnect(const TestParam& param,
+ void (*stopListen)(FileDescriptor&)) {
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ constexpr int kBacklog = 2;
+ constexpr int kClients = kBacklog + 1;
+
+ // Create the listening socket.
+ FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ std::vector<FileDescriptor> clients;
+ for (int i = 0; i < kClients; i++) {
+ FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+ int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len);
+ if (ret != 0) {
+ EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
+ clients.push_back(std::move(client));
+ }
+ }
+
+ stopListen(listen_fd);
+
+ for (auto& client : clients) {
+ const int kTimeout = 10000;
+ struct pollfd pfd = {
+ .fd = client.get(),
+ .events = POLLIN,
+ };
+ // When the listening socket is closed, then we expect the remote to reset
+ // the connection.
+ ASSERT_THAT(poll(&pfd, 1, kTimeout), SyscallSucceedsWithValue(1));
+ ASSERT_EQ(pfd.revents, POLLIN | POLLHUP | POLLERR);
+ char c;
+ // Subsequent read can fail with:
+ // ECONNRESET: If the client connection was established and was reset by the
+ // remote.
+ // ECONNREFUSED: If the client connection failed to be established.
+ ASSERT_THAT(read(client.get(), &c, sizeof(c)),
+ AnyOf(SyscallFailsWithErrno(ECONNRESET),
+ SyscallFailsWithErrno(ECONNREFUSED)));
+ }
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenCloseWhileConnect) {
+ TestListenWhileConnect(GetParam(), [](FileDescriptor& f) {
+ ASSERT_THAT(close(f.release()), SyscallSucceeds());
+ });
+}
+
+TEST_P(SocketInetLoopbackTest, TCPListenShutdownWhileConnect) {
+ TestListenWhileConnect(GetParam(), [](FileDescriptor& f) {
+ ASSERT_THAT(shutdown(f.get(), SHUT_RD), SyscallSucceeds());
+ });
+}
+
+TEST_P(SocketInetLoopbackTest, TCPbacklog) {
+ auto const& param = GetParam();
+
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), 2), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+ int i = 0;
+ while (1) {
+ int ret;
+
+ // Connect to the listening socket.
+ const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ret = connect(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len);
+ if (ret != 0) {
+ EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
+ struct pollfd pfd = {
+ .fd = conn_fd.get(),
+ .events = POLLOUT,
+ };
+ ret = poll(&pfd, 1, 3000);
+ if (ret == 0) break;
+ EXPECT_THAT(ret, SyscallSucceedsWithValue(1));
+ }
+ EXPECT_THAT(RetryEINTR(send)(conn_fd.get(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+ ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds());
+ i++;
+ }
+
+ for (; i != 0; i--) {
+ // Accept the connection.
+ //
+ // We have to assign a name to the accepted socket, as unamed temporary
+ // objects are destructed upon full evaluation of the expression it is in,
+ // potentially causing the connecting socket to fail to shutdown properly.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+ }
+}
+
+// TCPFinWait2Test creates a pair of connected sockets then closes one end to
+// trigger FIN_WAIT2 state for the closed endpoint. Then it binds the same local
+// IP/port on a new socket and tries to connect. The connect should fail w/
+// an EADDRINUSE. Then we wait till the FIN_WAIT2 timeout is over and try the
+// connect again with a new socket and this time it should succeed.
+//
+// TCP timers are not S/R today, this can cause this test to be flaky when run
+// under random S/R due to timer being reset on a restore.
+TEST_P(SocketInetLoopbackTest, TCPFinWait2Test_NoRandomSave) {
+ auto const& param = GetParam();
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ // Connect to the listening socket.
+ FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+ // Lower FIN_WAIT2 state to 5 seconds for test.
+ constexpr int kTCPLingerTimeout = 5;
+ EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2,
+ &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
+ SyscallSucceedsWithValue(0));
+
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+
+ // Accept the connection.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+ // Get the address/port bound by the connecting socket.
+ sockaddr_storage conn_bound_addr;
+ socklen_t conn_addrlen = connector.addr_len;
+ ASSERT_THAT(
+ getsockname(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+ &conn_addrlen),
+ SyscallSucceeds());
+
+ // close the connecting FD to trigger FIN_WAIT2 on the connected fd.
+ conn_fd.reset();
+
+ // Now bind and connect a new socket.
+ const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+ // Disable cooperative saves after this point. As a save between the first
+ // bind/connect and the second one can cause the linger timeout timer to
+ // be restarted causing the final bind/connect to fail.
+ DisableSave ds;
+
+ ASSERT_THAT(bind(conn_fd2.get(),
+ reinterpret_cast<sockaddr*>(&conn_bound_addr), conn_addrlen),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Sleep for a little over the linger timeout to reduce flakiness in
+ // save/restore tests.
+ absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 2));
+
+ ds.reset();
+
+ if (!IsRunningOnGvisor()) {
+ ASSERT_THAT(
+ bind(conn_fd2.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+ conn_addrlen),
+ SyscallSucceeds());
+ }
+ ASSERT_THAT(RetryEINTR(connect)(conn_fd2.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ conn_addrlen),
+ SyscallSucceeds());
+}
+
+// TCPLinger2TimeoutAfterClose creates a pair of connected sockets
+// then closes one end to trigger FIN_WAIT2 state for the closed endpont.
+// It then sleeps for the TCP_LINGER2 timeout and verifies that bind/
+// connecting the same address succeeds.
+//
+// TCP timers are not S/R today, this can cause this test to be flaky when run
+// under random S/R due to timer being reset on a restore.
+TEST_P(SocketInetLoopbackTest, TCPLinger2TimeoutAfterClose_NoRandomSave) {
+ auto const& param = GetParam();
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ // Connect to the listening socket.
+ FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+
+ // Accept the connection.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+ // Get the address/port bound by the connecting socket.
+ sockaddr_storage conn_bound_addr;
+ socklen_t conn_addrlen = connector.addr_len;
+ ASSERT_THAT(
+ getsockname(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+ &conn_addrlen),
+ SyscallSucceeds());
+
+ // Disable cooperative saves after this point as TCP timers are not restored
+ // across a S/R.
+ {
+ DisableSave ds;
+ constexpr int kTCPLingerTimeout = 5;
+ EXPECT_THAT(setsockopt(conn_fd.get(), IPPROTO_TCP, TCP_LINGER2,
+ &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
+ SyscallSucceedsWithValue(0));
+
+ // close the connecting FD to trigger FIN_WAIT2 on the connected fd.
+ conn_fd.reset();
+
+ absl::SleepFor(absl::Seconds(kTCPLingerTimeout + 1));
+
+ // ds going out of scope will Re-enable S/R's since at this point the timer
+ // must have fired and cleaned up the endpoint.
+ }
+
+ // Now bind and connect a new socket and verify that we can immediately
+ // rebind the address bound by the conn_fd as it never entered TIME_WAIT.
+ const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+ ASSERT_THAT(bind(conn_fd2.get(),
+ reinterpret_cast<sockaddr*>(&conn_bound_addr), conn_addrlen),
+ SyscallSucceeds());
+ ASSERT_THAT(RetryEINTR(connect)(conn_fd2.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ conn_addrlen),
+ SyscallSucceeds());
+}
+
+// TCPResetAfterClose creates a pair of connected sockets then closes
+// one end to trigger FIN_WAIT2 state for the closed endpoint verifies
+// that we generate RSTs for any new data after the socket is fully
+// closed.
+TEST_P(SocketInetLoopbackTest, TCPResetAfterClose) {
+ auto const& param = GetParam();
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ // Connect to the listening socket.
+ FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+
+ // Accept the connection.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+ // close the connecting FD to trigger FIN_WAIT2 on the connected fd.
+ conn_fd.reset();
+
+ int data = 1234;
+
+ // Now send data which should trigger a RST as the other end should
+ // have timed out and closed the socket.
+ EXPECT_THAT(RetryEINTR(send)(accepted.get(), &data, sizeof(data), 0),
+ SyscallSucceeds());
+ // Sleep for a shortwhile to get a RST back.
+ absl::SleepFor(absl::Seconds(1));
+
+ // Try writing again and we should get an EPIPE back.
+ EXPECT_THAT(RetryEINTR(send)(accepted.get(), &data, sizeof(data), 0),
+ SyscallFailsWithErrno(EPIPE));
+
+ // Trying to read should return zero as the other end did send
+ // us a FIN. We do it twice to verify that the RST does not cause an
+ // ECONNRESET on the read after EOF has been read by applicaiton.
+ EXPECT_THAT(RetryEINTR(recv)(accepted.get(), &data, sizeof(data), 0),
+ SyscallSucceedsWithValue(0));
+ EXPECT_THAT(RetryEINTR(recv)(accepted.get(), &data, sizeof(data), 0),
+ SyscallSucceedsWithValue(0));
+}
+
+// This test is disabled under random save as the the restore run
+// results in the stack.Seed() being different which can cause
+// sequence number of final connect to be one that is considered
+// old and can cause the test to be flaky.
+TEST_P(SocketInetLoopbackTest, TCPTimeWaitTest_NoRandomSave) {
+ auto const& param = GetParam();
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ // Connect to the listening socket.
+ FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+ // We disable saves after this point as a S/R causes the netstack seed
+ // to be regenerated which changes what ports/ISN is picked for a given
+ // tuple (src ip,src port, dst ip, dst port). This can cause the final
+ // SYN to use a sequence number that looks like one from the current
+ // connection in TIME_WAIT and will not be accepted causing the test
+ // to timeout.
+ //
+ // TODO(gvisor.dev/issue/940): S/R portSeed/portHint
+ DisableSave ds;
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+
+ // Accept the connection.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+ // Get the address/port bound by the connecting socket.
+ sockaddr_storage conn_bound_addr;
+ socklen_t conn_addrlen = connector.addr_len;
+ ASSERT_THAT(
+ getsockname(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_bound_addr),
+ &conn_addrlen),
+ SyscallSucceeds());
+
+ // close the accept FD to trigger TIME_WAIT on the accepted socket which
+ // should cause the conn_fd to follow CLOSE_WAIT->LAST_ACK->CLOSED instead of
+ // TIME_WAIT.
+ accepted.reset();
+ absl::SleepFor(absl::Seconds(1));
+ conn_fd.reset();
+ absl::SleepFor(absl::Seconds(1));
+
+ // Now bind and connect a new socket and verify that we can immediately
+ // rebind the address bound by the conn_fd as it never entered TIME_WAIT.
+ const FileDescriptor conn_fd2 = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+ ASSERT_THAT(bind(conn_fd2.get(),
+ reinterpret_cast<sockaddr*>(&conn_bound_addr), conn_addrlen),
+ SyscallSucceeds());
+ ASSERT_THAT(RetryEINTR(connect)(conn_fd2.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ conn_addrlen),
+ SyscallSucceeds());
+}
+
+TEST_P(SocketInetLoopbackTest, AcceptedInheritsTCPUserTimeout) {
+ auto const& param = GetParam();
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+
+ const uint16_t port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ // Set the userTimeout on the listening socket.
+ constexpr int kUserTimeout = 10;
+ ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &kUserTimeout, sizeof(kUserTimeout)),
+ SyscallSucceeds());
+
+ // Connect to the listening socket.
+ FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+
+ // Accept the connection.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+ // Verify that the accepted socket inherited the user timeout set on
+ // listening socket.
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(accepted.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kUserTimeout);
+}
+
+// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+// saved. Enable S/R once issue is fixed.
+TEST_P(SocketInetLoopbackTest, TCPDeferAccept_NoRandomSave) {
+ // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+ // saved. Enable S/R issue is fixed.
+ DisableSave ds;
+
+ auto const& param = GetParam();
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+
+ const uint16_t port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ // Set the TCP_DEFER_ACCEPT on the listening socket.
+ constexpr int kTCPDeferAccept = 3;
+ ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+ &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+ SyscallSucceeds());
+
+ // Connect to the listening socket.
+ FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+
+ // Set the listening socket to nonblock so that we can verify that there is no
+ // connection in queue despite the connect above succeeding since the peer has
+ // sent no data and TCP_DEFER_ACCEPT is set on the listening socket. Set the
+ // FD to O_NONBLOCK.
+ int opts;
+ ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds());
+ opts |= O_NONBLOCK;
+ ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+ ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Set FD back to blocking.
+ opts &= ~O_NONBLOCK;
+ ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+ // Now write some data to the socket.
+ int data = 0;
+ ASSERT_THAT(RetryEINTR(write)(conn_fd.get(), &data, sizeof(data)),
+ SyscallSucceedsWithValue(sizeof(data)));
+
+ // This should now cause the connection to complete and be delivered to the
+ // accept socket.
+
+ // Accept the connection.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+
+ // Verify that the accepted socket returns the data written.
+ int get = -1;
+ ASSERT_THAT(RetryEINTR(recv)(accepted.get(), &get, sizeof(get), 0),
+ SyscallSucceedsWithValue(sizeof(get)));
+
+ EXPECT_EQ(get, data);
+}
+
+// TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+// saved. Enable S/R once issue is fixed.
+TEST_P(SocketInetLoopbackTest, TCPDeferAcceptTimeout_NoRandomSave) {
+ // TODO(gvisor.dev/issue/1688): Partially completed passive endpoints are not
+ // saved. Enable S/R once issue is fixed.
+ DisableSave ds;
+
+ auto const& param = GetParam();
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), SOMAXCONN), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+
+ const uint16_t port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ // Set the TCP_DEFER_ACCEPT on the listening socket.
+ constexpr int kTCPDeferAccept = 3;
+ ASSERT_THAT(setsockopt(listen_fd.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+ &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+ SyscallSucceeds());
+
+ // Connect to the listening socket.
+ FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ASSERT_THAT(RetryEINTR(connect)(conn_fd.get(),
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+
+ // Set the listening socket to nonblock so that we can verify that there is no
+ // connection in queue despite the connect above succeeding since the peer has
+ // sent no data and TCP_DEFER_ACCEPT is set on the listening socket. Set the
+ // FD to O_NONBLOCK.
+ int opts;
+ ASSERT_THAT(opts = fcntl(listen_fd.get(), F_GETFL), SyscallSucceeds());
+ opts |= O_NONBLOCK;
+ ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+ // Verify that there is no acceptable connection before TCP_DEFER_ACCEPT
+ // timeout is hit.
+ absl::SleepFor(absl::Seconds(kTCPDeferAccept - 1));
+ ASSERT_THAT(accept(listen_fd.get(), nullptr, nullptr),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Set FD back to blocking.
+ opts &= ~O_NONBLOCK;
+ ASSERT_THAT(fcntl(listen_fd.get(), F_SETFL, opts), SyscallSucceeds());
+
+ // Now sleep for a little over the TCP_DEFER_ACCEPT duration. When the timeout
+ // is hit a SYN-ACK should be retransmitted by the listener as a last ditch
+ // attempt to complete the connection with or without data.
+ absl::SleepFor(absl::Seconds(2));
+
+ // Verify that we have a connection that can be accepted even though no
+ // data was written.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ All, SocketInetLoopbackTest,
+ ::testing::Values(
+ // Listeners bound to IPv4 addresses refuse connections using IPv6
+ // addresses.
+ TestParam{V4Any(), V4Any()}, TestParam{V4Any(), V4Loopback()},
+ TestParam{V4Any(), V4MappedAny()},
+ TestParam{V4Any(), V4MappedLoopback()},
+ TestParam{V4Loopback(), V4Any()}, TestParam{V4Loopback(), V4Loopback()},
+ TestParam{V4Loopback(), V4MappedLoopback()},
+ TestParam{V4MappedAny(), V4Any()},
+ TestParam{V4MappedAny(), V4Loopback()},
+ TestParam{V4MappedAny(), V4MappedAny()},
+ TestParam{V4MappedAny(), V4MappedLoopback()},
+ TestParam{V4MappedLoopback(), V4Any()},
+ TestParam{V4MappedLoopback(), V4Loopback()},
+ TestParam{V4MappedLoopback(), V4MappedLoopback()},
+
+ // Listeners bound to IN6ADDR_ANY accept all connections.
+ TestParam{V6Any(), V4Any()}, TestParam{V6Any(), V4Loopback()},
+ TestParam{V6Any(), V4MappedAny()},
+ TestParam{V6Any(), V4MappedLoopback()}, TestParam{V6Any(), V6Any()},
+ TestParam{V6Any(), V6Loopback()},
+
+ // Listeners bound to IN6ADDR_LOOPBACK refuse connections using IPv4
+ // addresses.
+ TestParam{V6Loopback(), V6Any()},
+ TestParam{V6Loopback(), V6Loopback()}),
+ DescribeTestParam);
+
+using SocketInetReusePortTest = ::testing::TestWithParam<TestParam>;
+
+// TODO(gvisor.dev/issue/940): Remove _NoRandomSave when portHint/stack.Seed is
+// saved/restored.
+TEST_P(SocketInetReusePortTest, TcpPortReuseMultiThread_NoRandomSave) {
+ auto const& param = GetParam();
+
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+ sockaddr_storage listen_addr = listener.addr;
+ sockaddr_storage conn_addr = connector.addr;
+ constexpr int kThreadCount = 3;
+ constexpr int kConnectAttempts = 10000;
+
+ // Create the listening socket.
+ FileDescriptor listener_fds[kThreadCount];
+ for (int i = 0; i < kThreadCount; i++) {
+ listener_fds[i] = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ int fd = listener_fds[i].get();
+
+ ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(fd, 40), SyscallSucceeds());
+
+ // On the first bind we need to determine which port was bound.
+ if (i != 0) {
+ continue;
+ }
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(
+ getsockname(listener_fds[0].get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+ ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ }
+
+ std::atomic<int> connects_received = ATOMIC_VAR_INIT(0);
+ std::unique_ptr<ScopedThread> listen_thread[kThreadCount];
+ int accept_counts[kThreadCount] = {};
+ // TODO(avagin): figure how to not disable S/R for the whole test.
+ // We need to take into account that this test executes a lot of system
+ // calls from many threads.
+ DisableSave ds;
+
+ for (int i = 0; i < kThreadCount; i++) {
+ listen_thread[i] = absl::make_unique<ScopedThread>(
+ [&listener_fds, &accept_counts, i, &connects_received]() {
+ do {
+ auto fd = Accept(listener_fds[i].get(), nullptr, nullptr);
+ if (!fd.ok()) {
+ if (connects_received >= kConnectAttempts) {
+ // Another thread have shutdown our read side causing the
+ // accept to fail.
+ ASSERT_EQ(errno, EINVAL);
+ break;
+ }
+ ASSERT_NO_ERRNO(fd);
+ break;
+ }
+ // Receive some data from a socket to be sure that the connect()
+ // system call has been completed on another side.
+ // Do a short read and then close the socket to trigger a RST. This
+ // ensures that both ends of the connection are cleaned up and no
+ // goroutines hang around in TIME-WAIT. We do this so that this test
+ // does not timeout under gotsan runs where lots of goroutines can
+ // cause the test to use absurd amounts of memory.
+ //
+ // See: https://tools.ietf.org/html/rfc2525#page-50 section 2.17
+ uint16_t data;
+ EXPECT_THAT(
+ RetryEINTR(recv)(fd.ValueOrDie().get(), &data, sizeof(data), 0),
+ SyscallSucceedsWithValue(sizeof(data)));
+ accept_counts[i]++;
+ } while (++connects_received < kConnectAttempts);
+
+ // Shutdown all sockets to wake up other threads.
+ for (int j = 0; j < kThreadCount; j++) {
+ shutdown(listener_fds[j].get(), SHUT_RDWR);
+ }
+ });
+ }
+
+ ScopedThread connecting_thread([&connector, &conn_addr]() {
+ for (int i = 0; i < kConnectAttempts; i++) {
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+ ASSERT_THAT(
+ RetryEINTR(connect)(fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceeds());
+
+ // Do two separate sends to ensure two segments are received. This is
+ // required for netstack where read is incorrectly assuming a whole
+ // segment is read when endpoint.Read() is called which is technically
+ // incorrect as the syscall that invoked endpoint.Read() may only
+ // consume it partially. This results in a case where a close() of
+ // such a socket does not trigger a RST in netstack due to the
+ // endpoint assuming that the endpoint has no unread data.
+ EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+
+ // TODO(gvisor.dev/issue/1449): Remove this block once netstack correctly
+ // generates a RST.
+ if (IsRunningOnGvisor()) {
+ EXPECT_THAT(RetryEINTR(send)(fd.get(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+ }
+ }
+ });
+
+ // Join threads to be sure that all connections have been counted
+ connecting_thread.Join();
+ for (int i = 0; i < kThreadCount; i++) {
+ listen_thread[i]->Join();
+ }
+ // Check that connections are distributed fairly between listening sockets
+ for (int i = 0; i < kThreadCount; i++)
+ EXPECT_THAT(accept_counts[i],
+ EquivalentWithin((kConnectAttempts / kThreadCount), 0.10));
+}
+
+TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThread_NoRandomSave) {
+ auto const& param = GetParam();
+
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+ sockaddr_storage listen_addr = listener.addr;
+ sockaddr_storage conn_addr = connector.addr;
+ constexpr int kThreadCount = 3;
+
+ // Create the listening socket.
+ FileDescriptor listener_fds[kThreadCount];
+ for (int i = 0; i < kThreadCount; i++) {
+ listener_fds[i] =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(listener.family(), SOCK_DGRAM, 0));
+ int fd = listener_fds[i].get();
+
+ ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
+ SyscallSucceeds());
+
+ // On the first bind we need to determine which port was bound.
+ if (i != 0) {
+ continue;
+ }
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(
+ getsockname(listener_fds[0].get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+ ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ }
+
+ constexpr int kConnectAttempts = 10000;
+ std::atomic<int> packets_received = ATOMIC_VAR_INIT(0);
+ std::unique_ptr<ScopedThread> receiver_thread[kThreadCount];
+ int packets_per_socket[kThreadCount] = {};
+ // TODO(avagin): figure how to not disable S/R for the whole test.
+ DisableSave ds; // Too expensive.
+
+ for (int i = 0; i < kThreadCount; i++) {
+ receiver_thread[i] = absl::make_unique<ScopedThread>(
+ [&listener_fds, &packets_per_socket, i, &packets_received]() {
+ do {
+ struct sockaddr_storage addr = {};
+ socklen_t addrlen = sizeof(addr);
+ int data;
+
+ auto ret = RetryEINTR(recvfrom)(
+ listener_fds[i].get(), &data, sizeof(data), 0,
+ reinterpret_cast<struct sockaddr*>(&addr), &addrlen);
+
+ if (packets_received < kConnectAttempts) {
+ ASSERT_THAT(ret, SyscallSucceedsWithValue(sizeof(data)));
+ }
+
+ if (ret != sizeof(data)) {
+ // Another thread may have shutdown our read side causing the
+ // recvfrom to fail.
+ break;
+ }
+
+ packets_received++;
+ packets_per_socket[i]++;
+
+ // A response is required to synchronize with the main thread,
+ // otherwise the main thread can send more than can fit into receive
+ // queues.
+ EXPECT_THAT(RetryEINTR(sendto)(
+ listener_fds[i].get(), &data, sizeof(data), 0,
+ reinterpret_cast<sockaddr*>(&addr), addrlen),
+ SyscallSucceedsWithValue(sizeof(data)));
+ } while (packets_received < kConnectAttempts);
+
+ // Shutdown all sockets to wake up other threads.
+ for (int j = 0; j < kThreadCount; j++)
+ shutdown(listener_fds[j].get(), SHUT_RDWR);
+ });
+ }
+
+ ScopedThread main_thread([&connector, &conn_addr]() {
+ for (int i = 0; i < kConnectAttempts; i++) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(connector.family(), SOCK_DGRAM, 0));
+ EXPECT_THAT(RetryEINTR(sendto)(fd.get(), &i, sizeof(i), 0,
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceedsWithValue(sizeof(i)));
+ int data;
+ EXPECT_THAT(RetryEINTR(recv)(fd.get(), &data, sizeof(data), 0),
+ SyscallSucceedsWithValue(sizeof(data)));
+ }
+ });
+
+ main_thread.Join();
+
+ // Join threads to be sure that all connections have been counted
+ for (int i = 0; i < kThreadCount; i++) {
+ receiver_thread[i]->Join();
+ }
+ // Check that packets are distributed fairly between listening sockets.
+ for (int i = 0; i < kThreadCount; i++)
+ EXPECT_THAT(packets_per_socket[i],
+ EquivalentWithin((kConnectAttempts / kThreadCount), 0.10));
+}
+
+TEST_P(SocketInetReusePortTest, UdpPortReuseMultiThreadShort_NoRandomSave) {
+ auto const& param = GetParam();
+
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+ sockaddr_storage listen_addr = listener.addr;
+ sockaddr_storage conn_addr = connector.addr;
+ constexpr int kThreadCount = 3;
+
+ // TODO(b/141211329): endpointsByNic.seed has to be saved/restored.
+ const DisableSave ds141211329;
+
+ // Create listening sockets.
+ FileDescriptor listener_fds[kThreadCount];
+ for (int i = 0; i < kThreadCount; i++) {
+ listener_fds[i] =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(listener.family(), SOCK_DGRAM, 0));
+ int fd = listener_fds[i].get();
+
+ ASSERT_THAT(setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ bind(fd, reinterpret_cast<sockaddr*>(&listen_addr), listener.addr_len),
+ SyscallSucceeds());
+
+ // On the first bind we need to determine which port was bound.
+ if (i != 0) {
+ continue;
+ }
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(
+ getsockname(listener_fds[0].get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+ ASSERT_NO_ERRNO(SetAddrPort(listener.family(), &listen_addr, port));
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ }
+
+ constexpr int kConnectAttempts = 10;
+ FileDescriptor client_fds[kConnectAttempts];
+
+ // Do the first run without save/restore.
+ DisableSave ds;
+ for (int i = 0; i < kConnectAttempts; i++) {
+ client_fds[i] =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(connector.family(), SOCK_DGRAM, 0));
+ EXPECT_THAT(RetryEINTR(sendto)(client_fds[i].get(), &i, sizeof(i), 0,
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceedsWithValue(sizeof(i)));
+ }
+ ds.reset();
+
+ // Check that a mapping of client and server sockets has
+ // not been change after save/restore.
+ for (int i = 0; i < kConnectAttempts; i++) {
+ EXPECT_THAT(RetryEINTR(sendto)(client_fds[i].get(), &i, sizeof(i), 0,
+ reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len),
+ SyscallSucceedsWithValue(sizeof(i)));
+ }
+
+ struct pollfd pollfds[kThreadCount];
+ for (int i = 0; i < kThreadCount; i++) {
+ pollfds[i].fd = listener_fds[i].get();
+ pollfds[i].events = POLLIN;
+ }
+
+ std::map<uint16_t, int> portToFD;
+
+ int received = 0;
+ while (received < kConnectAttempts * 2) {
+ ASSERT_THAT(poll(pollfds, kThreadCount, -1),
+ SyscallSucceedsWithValue(Gt(0)));
+
+ for (int i = 0; i < kThreadCount; i++) {
+ if ((pollfds[i].revents & POLLIN) == 0) {
+ continue;
+ }
+
+ received++;
+
+ const int fd = pollfds[i].fd;
+ struct sockaddr_storage addr = {};
+ socklen_t addrlen = sizeof(addr);
+ int data;
+ EXPECT_THAT(RetryEINTR(recvfrom)(
+ fd, &data, sizeof(data), 0,
+ reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+ SyscallSucceedsWithValue(sizeof(data)));
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(connector.family(), addr));
+ auto prev_port = portToFD.find(port);
+ // Check that all packets from one client have been delivered to the
+ // same server socket.
+ if (prev_port == portToFD.end()) {
+ portToFD[port] = fd;
+ } else {
+ EXPECT_EQ(portToFD[port], fd);
+ }
+ }
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ All, SocketInetReusePortTest,
+ ::testing::Values(
+ // Listeners bound to IPv4 addresses refuse connections using IPv6
+ // addresses.
+ TestParam{V4Any(), V4Loopback()},
+ TestParam{V4Loopback(), V4MappedLoopback()},
+
+ // Listeners bound to IN6ADDR_ANY accept all connections.
+ TestParam{V6Any(), V4Loopback()}, TestParam{V6Any(), V6Loopback()},
+
+ // Listeners bound to IN6ADDR_LOOPBACK refuse connections using IPv4
+ // addresses.
+ TestParam{V6Loopback(), V6Loopback()}),
+ DescribeTestParam);
+
+struct ProtocolTestParam {
+ std::string description;
+ int type;
+};
+
+std::string DescribeProtocolTestParam(
+ ::testing::TestParamInfo<ProtocolTestParam> const& info) {
+ return info.param.description;
+}
+
+using SocketMultiProtocolInetLoopbackTest =
+ ::testing::TestWithParam<ProtocolTestParam>;
+
+TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedLoopbackOnlyReservesV4) {
+ auto const& param = GetParam();
+
+ for (int i = 0; true; i++) {
+ // Bind the v4 loopback on a dual stack socket.
+ TestAddress const& test_addr_dual = V4MappedLoopback();
+ sockaddr_storage addr_dual = test_addr_dual.addr;
+ const FileDescriptor fd_dual = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_dual.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast<sockaddr*>(&addr_dual),
+ test_addr_dual.addr_len),
+ SyscallSucceeds());
+
+ // Get the port that we bound.
+ socklen_t addrlen = test_addr_dual.addr_len;
+ ASSERT_THAT(getsockname(fd_dual.get(),
+ reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
+
+ // Verify that we can still bind the v6 loopback on the same port.
+ TestAddress const& test_addr_v6 = V6Loopback();
+ sockaddr_storage addr_v6 = test_addr_v6.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v6.family(), &addr_v6, port));
+ const FileDescriptor fd_v6 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v6.family(), param.type, 0));
+ int ret = bind(fd_v6.get(), reinterpret_cast<sockaddr*>(&addr_v6),
+ test_addr_v6.addr_len);
+ if (ret == -1 && errno == EADDRINUSE) {
+ // Port may have been in use.
+ ASSERT_LT(i, 100); // Give up after 100 tries.
+ continue;
+ }
+ ASSERT_THAT(ret, SyscallSucceeds());
+
+ // Verify that binding the v4 loopback with the same port on a v4 socket
+ // fails.
+ TestAddress const& test_addr_v4 = V4Loopback();
+ sockaddr_storage addr_v4 = test_addr_v4.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4.family(), &addr_v4, port));
+ const FileDescriptor fd_v4 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v4.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_v4.get(), reinterpret_cast<sockaddr*>(&addr_v4),
+ test_addr_v4.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // No need to try again.
+ break;
+ }
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedAnyOnlyReservesV4) {
+ auto const& param = GetParam();
+
+ for (int i = 0; true; i++) {
+ // Bind the v4 any on a dual stack socket.
+ TestAddress const& test_addr_dual = V4MappedAny();
+ sockaddr_storage addr_dual = test_addr_dual.addr;
+ const FileDescriptor fd_dual = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_dual.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast<sockaddr*>(&addr_dual),
+ test_addr_dual.addr_len),
+ SyscallSucceeds());
+
+ // Get the port that we bound.
+ socklen_t addrlen = test_addr_dual.addr_len;
+ ASSERT_THAT(getsockname(fd_dual.get(),
+ reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
+
+ // Verify that we can still bind the v6 loopback on the same port.
+ TestAddress const& test_addr_v6 = V6Loopback();
+ sockaddr_storage addr_v6 = test_addr_v6.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v6.family(), &addr_v6, port));
+ const FileDescriptor fd_v6 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v6.family(), param.type, 0));
+ int ret = bind(fd_v6.get(), reinterpret_cast<sockaddr*>(&addr_v6),
+ test_addr_v6.addr_len);
+ if (ret == -1 && errno == EADDRINUSE) {
+ // Port may have been in use.
+ ASSERT_LT(i, 100); // Give up after 100 tries.
+ continue;
+ }
+ ASSERT_THAT(ret, SyscallSucceeds());
+
+ // Verify that binding the v4 loopback with the same port on a v4 socket
+ // fails.
+ TestAddress const& test_addr_v4 = V4Loopback();
+ sockaddr_storage addr_v4 = test_addr_v4.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4.family(), &addr_v4, port));
+ const FileDescriptor fd_v4 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v4.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_v4.get(), reinterpret_cast<sockaddr*>(&addr_v4),
+ test_addr_v4.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // No need to try again.
+ break;
+ }
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest, DualStackV6AnyReservesEverything) {
+ auto const& param = GetParam();
+
+ // Bind the v6 any on a dual stack socket.
+ TestAddress const& test_addr_dual = V6Any();
+ sockaddr_storage addr_dual = test_addr_dual.addr;
+ const FileDescriptor fd_dual =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_dual.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast<sockaddr*>(&addr_dual),
+ test_addr_dual.addr_len),
+ SyscallSucceeds());
+
+ // Get the port that we bound.
+ socklen_t addrlen = test_addr_dual.addr_len;
+ ASSERT_THAT(getsockname(fd_dual.get(),
+ reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
+
+ // Verify that binding the v6 loopback with the same port fails.
+ TestAddress const& test_addr_v6 = V6Loopback();
+ sockaddr_storage addr_v6 = test_addr_v6.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v6.family(), &addr_v6, port));
+ const FileDescriptor fd_v6 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v6.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_v6.get(), reinterpret_cast<sockaddr*>(&addr_v6),
+ test_addr_v6.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that binding the v4 loopback on the same port with a v6 socket
+ // fails.
+ TestAddress const& test_addr_v4_mapped = V4MappedLoopback();
+ sockaddr_storage addr_v4_mapped = test_addr_v4_mapped.addr;
+ ASSERT_NO_ERRNO(
+ SetAddrPort(test_addr_v4_mapped.family(), &addr_v4_mapped, port));
+ const FileDescriptor fd_v4_mapped = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v4_mapped.family(), param.type, 0));
+ ASSERT_THAT(
+ bind(fd_v4_mapped.get(), reinterpret_cast<sockaddr*>(&addr_v4_mapped),
+ test_addr_v4_mapped.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that binding the v4 loopback on the same port with a v4 socket
+ // fails.
+ TestAddress const& test_addr_v4 = V4Loopback();
+ sockaddr_storage addr_v4 = test_addr_v4.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4.family(), &addr_v4, port));
+ const FileDescriptor fd_v4 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v4.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_v4.get(), reinterpret_cast<sockaddr*>(&addr_v4),
+ test_addr_v4.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that binding the v4 any on the same port with a v4 socket
+ // fails.
+ TestAddress const& test_addr_v4_any = V4Any();
+ sockaddr_storage addr_v4_any = test_addr_v4_any.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4_any.family(), &addr_v4_any, port));
+ const FileDescriptor fd_v4_any = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v4_any.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_v4_any.get(), reinterpret_cast<sockaddr*>(&addr_v4_any),
+ test_addr_v4_any.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest,
+ DualStackV6AnyReuseAddrDoesNotReserveV4Any) {
+ auto const& param = GetParam();
+
+ // Bind the v6 any on a dual stack socket.
+ TestAddress const& test_addr_dual = V6Any();
+ sockaddr_storage addr_dual = test_addr_dual.addr;
+ const FileDescriptor fd_dual =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_dual.family(), param.type, 0));
+ ASSERT_THAT(setsockopt(fd_dual.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast<sockaddr*>(&addr_dual),
+ test_addr_dual.addr_len),
+ SyscallSucceeds());
+
+ // Get the port that we bound.
+ socklen_t addrlen = test_addr_dual.addr_len;
+ ASSERT_THAT(getsockname(fd_dual.get(),
+ reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
+
+ // Verify that binding the v4 any on the same port with a v4 socket succeeds.
+ TestAddress const& test_addr_v4_any = V4Any();
+ sockaddr_storage addr_v4_any = test_addr_v4_any.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4_any.family(), &addr_v4_any, port));
+ const FileDescriptor fd_v4_any = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v4_any.family(), param.type, 0));
+ ASSERT_THAT(setsockopt(fd_v4_any.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(fd_v4_any.get(), reinterpret_cast<sockaddr*>(&addr_v4_any),
+ test_addr_v4_any.addr_len),
+ SyscallSucceeds());
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest,
+ DualStackV6AnyReuseAddrListenReservesV4Any) {
+ auto const& param = GetParam();
+
+ // Only TCP sockets are supported.
+ SKIP_IF((param.type & SOCK_STREAM) == 0);
+
+ // Bind the v6 any on a dual stack socket.
+ TestAddress const& test_addr_dual = V6Any();
+ sockaddr_storage addr_dual = test_addr_dual.addr;
+ const FileDescriptor fd_dual =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_dual.family(), param.type, 0));
+ ASSERT_THAT(setsockopt(fd_dual.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast<sockaddr*>(&addr_dual),
+ test_addr_dual.addr_len),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(fd_dual.get(), 5), SyscallSucceeds());
+
+ // Get the port that we bound.
+ socklen_t addrlen = test_addr_dual.addr_len;
+ ASSERT_THAT(getsockname(fd_dual.get(),
+ reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
+
+ // Verify that binding the v4 any on the same port with a v4 socket succeeds.
+ TestAddress const& test_addr_v4_any = V4Any();
+ sockaddr_storage addr_v4_any = test_addr_v4_any.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4_any.family(), &addr_v4_any, port));
+ const FileDescriptor fd_v4_any = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v4_any.family(), param.type, 0));
+ ASSERT_THAT(setsockopt(fd_v4_any.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(bind(fd_v4_any.get(), reinterpret_cast<sockaddr*>(&addr_v4_any),
+ test_addr_v4_any.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest,
+ DualStackV6AnyWithListenReservesEverything) {
+ auto const& param = GetParam();
+
+ // Only TCP sockets are supported.
+ SKIP_IF((param.type & SOCK_STREAM) == 0);
+
+ // Bind the v6 any on a dual stack socket.
+ TestAddress const& test_addr_dual = V6Any();
+ sockaddr_storage addr_dual = test_addr_dual.addr;
+ const FileDescriptor fd_dual =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_dual.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast<sockaddr*>(&addr_dual),
+ test_addr_dual.addr_len),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(fd_dual.get(), 5), SyscallSucceeds());
+
+ // Get the port that we bound.
+ socklen_t addrlen = test_addr_dual.addr_len;
+ ASSERT_THAT(getsockname(fd_dual.get(),
+ reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
+
+ // Verify that binding the v6 loopback with the same port fails.
+ TestAddress const& test_addr_v6 = V6Loopback();
+ sockaddr_storage addr_v6 = test_addr_v6.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v6.family(), &addr_v6, port));
+ const FileDescriptor fd_v6 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v6.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_v6.get(), reinterpret_cast<sockaddr*>(&addr_v6),
+ test_addr_v6.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that binding the v4 loopback on the same port with a v6 socket
+ // fails.
+ TestAddress const& test_addr_v4_mapped = V4MappedLoopback();
+ sockaddr_storage addr_v4_mapped = test_addr_v4_mapped.addr;
+ ASSERT_NO_ERRNO(
+ SetAddrPort(test_addr_v4_mapped.family(), &addr_v4_mapped, port));
+ const FileDescriptor fd_v4_mapped = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v4_mapped.family(), param.type, 0));
+ ASSERT_THAT(
+ bind(fd_v4_mapped.get(), reinterpret_cast<sockaddr*>(&addr_v4_mapped),
+ test_addr_v4_mapped.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that binding the v4 loopback on the same port with a v4 socket
+ // fails.
+ TestAddress const& test_addr_v4 = V4Loopback();
+ sockaddr_storage addr_v4 = test_addr_v4.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4.family(), &addr_v4, port));
+ const FileDescriptor fd_v4 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v4.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_v4.get(), reinterpret_cast<sockaddr*>(&addr_v4),
+ test_addr_v4.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that binding the v4 any on the same port with a v4 socket
+ // fails.
+ TestAddress const& test_addr_v4_any = V4Any();
+ sockaddr_storage addr_v4_any = test_addr_v4_any.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4_any.family(), &addr_v4_any, port));
+ const FileDescriptor fd_v4_any = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v4_any.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_v4_any.get(), reinterpret_cast<sockaddr*>(&addr_v4_any),
+ test_addr_v4_any.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest, V6OnlyV6AnyReservesV6) {
+ auto const& param = GetParam();
+
+ for (int i = 0; true; i++) {
+ // Bind the v6 any on a v6-only socket.
+ TestAddress const& test_addr_dual = V6Any();
+ sockaddr_storage addr_dual = test_addr_dual.addr;
+ const FileDescriptor fd_dual = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_dual.family(), param.type, 0));
+ EXPECT_THAT(setsockopt(fd_dual.get(), IPPROTO_IPV6, IPV6_V6ONLY,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(fd_dual.get(), reinterpret_cast<sockaddr*>(&addr_dual),
+ test_addr_dual.addr_len),
+ SyscallSucceeds());
+
+ // Get the port that we bound.
+ socklen_t addrlen = test_addr_dual.addr_len;
+ ASSERT_THAT(getsockname(fd_dual.get(),
+ reinterpret_cast<sockaddr*>(&addr_dual), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr_dual.family(), addr_dual));
+
+ // Verify that binding the v6 loopback with the same port fails.
+ TestAddress const& test_addr_v6 = V6Loopback();
+ sockaddr_storage addr_v6 = test_addr_v6.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v6.family(), &addr_v6, port));
+ const FileDescriptor fd_v6 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v6.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_v6.get(), reinterpret_cast<sockaddr*>(&addr_v6),
+ test_addr_v6.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that we can still bind the v4 loopback on the same port.
+ TestAddress const& test_addr_v4_mapped = V4MappedLoopback();
+ sockaddr_storage addr_v4_mapped = test_addr_v4_mapped.addr;
+ ASSERT_NO_ERRNO(
+ SetAddrPort(test_addr_v4_mapped.family(), &addr_v4_mapped, port));
+ const FileDescriptor fd_v4_mapped = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v4_mapped.family(), param.type, 0));
+ int ret =
+ bind(fd_v4_mapped.get(), reinterpret_cast<sockaddr*>(&addr_v4_mapped),
+ test_addr_v4_mapped.addr_len);
+ if (ret == -1 && errno == EADDRINUSE) {
+ // Port may have been in use.
+ ASSERT_LT(i, 100); // Give up after 100 tries.
+ continue;
+ }
+ ASSERT_THAT(ret, SyscallSucceeds());
+
+ // No need to try again.
+ break;
+ }
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReserved) {
+ auto const& param = GetParam();
+
+ for (int i = 0; true; i++) {
+ // Bind the v6 loopback on a dual stack socket.
+ TestAddress const& test_addr = V6Loopback();
+ sockaddr_storage bound_addr = test_addr.addr;
+ const FileDescriptor bound_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+ test_addr.addr_len),
+ SyscallSucceeds());
+
+ // Listen iff TCP.
+ if (param.type == SOCK_STREAM) {
+ ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds());
+ }
+
+ // Get the port that we bound.
+ socklen_t bound_addr_len = test_addr.addr_len;
+ ASSERT_THAT(
+ getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+ &bound_addr_len),
+ SyscallSucceeds());
+
+ // Connect to bind an ephemeral port.
+ const FileDescriptor connected_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+ reinterpret_cast<sockaddr*>(&bound_addr),
+ bound_addr_len),
+ SyscallSucceeds());
+
+ // Get the ephemeral port.
+ sockaddr_storage connected_addr = {};
+ socklen_t connected_addr_len = sizeof(connected_addr);
+ ASSERT_THAT(getsockname(connected_fd.get(),
+ reinterpret_cast<sockaddr*>(&connected_addr),
+ &connected_addr_len),
+ SyscallSucceeds());
+ uint16_t const ephemeral_port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
+
+ // Verify that we actually got an ephemeral port.
+ ASSERT_NE(ephemeral_port, 0);
+
+ // Verify that the ephemeral port is reserved.
+ const FileDescriptor checking_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ EXPECT_THAT(
+ bind(checking_fd.get(), reinterpret_cast<sockaddr*>(&connected_addr),
+ connected_addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that binding the v6 loopback with the same port fails.
+ TestAddress const& test_addr_v6 = V6Loopback();
+ sockaddr_storage addr_v6 = test_addr_v6.addr;
+ ASSERT_NO_ERRNO(
+ SetAddrPort(test_addr_v6.family(), &addr_v6, ephemeral_port));
+ const FileDescriptor fd_v6 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v6.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_v6.get(), reinterpret_cast<sockaddr*>(&addr_v6),
+ test_addr_v6.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that we can still bind the v4 loopback on the same port.
+ TestAddress const& test_addr_v4_mapped = V4MappedLoopback();
+ sockaddr_storage addr_v4_mapped = test_addr_v4_mapped.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4_mapped.family(), &addr_v4_mapped,
+ ephemeral_port));
+ const FileDescriptor fd_v4_mapped = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v4_mapped.family(), param.type, 0));
+ int ret =
+ bind(fd_v4_mapped.get(), reinterpret_cast<sockaddr*>(&addr_v4_mapped),
+ test_addr_v4_mapped.addr_len);
+ if (ret == -1 && errno == EADDRINUSE) {
+ // Port may have been in use.
+ ASSERT_LT(i, 100); // Give up after 100 tries.
+ continue;
+ }
+ EXPECT_THAT(ret, SyscallSucceeds());
+
+ // No need to try again.
+ break;
+ }
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest, V6EphemeralPortReservedReuseAddr) {
+ auto const& param = GetParam();
+
+ // Bind the v6 loopback on a dual stack socket.
+ TestAddress const& test_addr = V6Loopback();
+ sockaddr_storage bound_addr = test_addr.addr;
+ const FileDescriptor bound_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+ test_addr.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Listen iff TCP.
+ if (param.type == SOCK_STREAM) {
+ ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds());
+ }
+
+ // Get the port that we bound.
+ socklen_t bound_addr_len = test_addr.addr_len;
+ ASSERT_THAT(
+ getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+ &bound_addr_len),
+ SyscallSucceeds());
+
+ // Connect to bind an ephemeral port.
+ const FileDescriptor connected_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+ reinterpret_cast<sockaddr*>(&bound_addr),
+ bound_addr_len),
+ SyscallSucceeds());
+
+ // Get the ephemeral port.
+ sockaddr_storage connected_addr = {};
+ socklen_t connected_addr_len = sizeof(connected_addr);
+ ASSERT_THAT(getsockname(connected_fd.get(),
+ reinterpret_cast<sockaddr*>(&connected_addr),
+ &connected_addr_len),
+ SyscallSucceeds());
+ uint16_t const ephemeral_port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
+
+ // Verify that we actually got an ephemeral port.
+ ASSERT_NE(ephemeral_port, 0);
+
+ // Verify that the ephemeral port is not reserved.
+ const FileDescriptor checking_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ EXPECT_THAT(
+ bind(checking_fd.get(), reinterpret_cast<sockaddr*>(&connected_addr),
+ connected_addr_len),
+ SyscallSucceeds());
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest, V4MappedEphemeralPortReserved) {
+ auto const& param = GetParam();
+
+ for (int i = 0; true; i++) {
+ // Bind the v4 loopback on a dual stack socket.
+ TestAddress const& test_addr = V4MappedLoopback();
+ sockaddr_storage bound_addr = test_addr.addr;
+ const FileDescriptor bound_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+ test_addr.addr_len),
+ SyscallSucceeds());
+
+ // Listen iff TCP.
+ if (param.type == SOCK_STREAM) {
+ ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds());
+ }
+
+ // Get the port that we bound.
+ socklen_t bound_addr_len = test_addr.addr_len;
+ ASSERT_THAT(
+ getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+ &bound_addr_len),
+ SyscallSucceeds());
+
+ // Connect to bind an ephemeral port.
+ const FileDescriptor connected_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+ reinterpret_cast<sockaddr*>(&bound_addr),
+ bound_addr_len),
+ SyscallSucceeds());
+
+ // Get the ephemeral port.
+ sockaddr_storage connected_addr = {};
+ socklen_t connected_addr_len = sizeof(connected_addr);
+ ASSERT_THAT(getsockname(connected_fd.get(),
+ reinterpret_cast<sockaddr*>(&connected_addr),
+ &connected_addr_len),
+ SyscallSucceeds());
+ uint16_t const ephemeral_port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
+
+ // Verify that we actually got an ephemeral port.
+ ASSERT_NE(ephemeral_port, 0);
+
+ // Verify that the ephemeral port is reserved.
+ const FileDescriptor checking_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ EXPECT_THAT(
+ bind(checking_fd.get(), reinterpret_cast<sockaddr*>(&connected_addr),
+ connected_addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that binding the v4 loopback on the same port with a v4 socket
+ // fails.
+ TestAddress const& test_addr_v4 = V4Loopback();
+ sockaddr_storage addr_v4 = test_addr_v4.addr;
+ ASSERT_NO_ERRNO(
+ SetAddrPort(test_addr_v4.family(), &addr_v4, ephemeral_port));
+ const FileDescriptor fd_v4 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr_v4.family(), param.type, 0));
+ EXPECT_THAT(bind(fd_v4.get(), reinterpret_cast<sockaddr*>(&addr_v4),
+ test_addr_v4.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that binding the v6 any on the same port with a dual-stack socket
+ // fails.
+ TestAddress const& test_addr_v6_any = V6Any();
+ sockaddr_storage addr_v6_any = test_addr_v6_any.addr;
+ ASSERT_NO_ERRNO(
+ SetAddrPort(test_addr_v6_any.family(), &addr_v6_any, ephemeral_port));
+ const FileDescriptor fd_v6_any = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v6_any.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_v6_any.get(), reinterpret_cast<sockaddr*>(&addr_v6_any),
+ test_addr_v6_any.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // For some reason, binding the TCP v6-only any is flaky on Linux. Maybe we
+ // tend to run out of ephemeral ports? Regardless, binding the v6 loopback
+ // seems pretty reliable. Only try to bind the v6-only any on UDP and
+ // gVisor.
+
+ int ret = -1;
+
+ if (!IsRunningOnGvisor() && param.type == SOCK_STREAM) {
+ // Verify that we can still bind the v6 loopback on the same port.
+ TestAddress const& test_addr_v6 = V6Loopback();
+ sockaddr_storage addr_v6 = test_addr_v6.addr;
+ ASSERT_NO_ERRNO(
+ SetAddrPort(test_addr_v6.family(), &addr_v6, ephemeral_port));
+ const FileDescriptor fd_v6 = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v6.family(), param.type, 0));
+ ret = bind(fd_v6.get(), reinterpret_cast<sockaddr*>(&addr_v6),
+ test_addr_v6.addr_len);
+ } else {
+ // Verify that we can still bind the v6 any on the same port with a
+ // v6-only socket.
+ const FileDescriptor fd_v6_only_any = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v6_any.family(), param.type, 0));
+ EXPECT_THAT(setsockopt(fd_v6_only_any.get(), IPPROTO_IPV6, IPV6_V6ONLY,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ret =
+ bind(fd_v6_only_any.get(), reinterpret_cast<sockaddr*>(&addr_v6_any),
+ test_addr_v6_any.addr_len);
+ }
+
+ if (ret == -1 && errno == EADDRINUSE) {
+ // Port may have been in use.
+ ASSERT_LT(i, 100); // Give up after 100 tries.
+ continue;
+ }
+ EXPECT_THAT(ret, SyscallSucceeds());
+
+ // No need to try again.
+ break;
+ }
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest,
+ V4MappedEphemeralPortReservedResueAddr) {
+ auto const& param = GetParam();
+
+ // Bind the v4 loopback on a dual stack socket.
+ TestAddress const& test_addr = V4MappedLoopback();
+ sockaddr_storage bound_addr = test_addr.addr;
+ const FileDescriptor bound_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+ test_addr.addr_len),
+ SyscallSucceeds());
+
+ ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Listen iff TCP.
+ if (param.type == SOCK_STREAM) {
+ ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds());
+ }
+
+ // Get the port that we bound.
+ socklen_t bound_addr_len = test_addr.addr_len;
+ ASSERT_THAT(
+ getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+ &bound_addr_len),
+ SyscallSucceeds());
+
+ // Connect to bind an ephemeral port.
+ const FileDescriptor connected_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+ reinterpret_cast<sockaddr*>(&bound_addr),
+ bound_addr_len),
+ SyscallSucceeds());
+
+ // Get the ephemeral port.
+ sockaddr_storage connected_addr = {};
+ socklen_t connected_addr_len = sizeof(connected_addr);
+ ASSERT_THAT(getsockname(connected_fd.get(),
+ reinterpret_cast<sockaddr*>(&connected_addr),
+ &connected_addr_len),
+ SyscallSucceeds());
+ uint16_t const ephemeral_port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
+
+ // Verify that we actually got an ephemeral port.
+ ASSERT_NE(ephemeral_port, 0);
+
+ // Verify that the ephemeral port is not reserved.
+ const FileDescriptor checking_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ EXPECT_THAT(
+ bind(checking_fd.get(), reinterpret_cast<sockaddr*>(&connected_addr),
+ connected_addr_len),
+ SyscallSucceeds());
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReserved) {
+ auto const& param = GetParam();
+
+ for (int i = 0; true; i++) {
+ // Bind the v4 loopback on a v4 socket.
+ TestAddress const& test_addr = V4Loopback();
+ sockaddr_storage bound_addr = test_addr.addr;
+ const FileDescriptor bound_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+ test_addr.addr_len),
+ SyscallSucceeds());
+
+ // Listen iff TCP.
+ if (param.type == SOCK_STREAM) {
+ ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds());
+ }
+
+ // Get the port that we bound.
+ socklen_t bound_addr_len = test_addr.addr_len;
+ ASSERT_THAT(
+ getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+ &bound_addr_len),
+ SyscallSucceeds());
+
+ // Connect to bind an ephemeral port.
+ const FileDescriptor connected_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+ reinterpret_cast<sockaddr*>(&bound_addr),
+ bound_addr_len),
+ SyscallSucceeds());
+
+ // Get the ephemeral port.
+ sockaddr_storage connected_addr = {};
+ socklen_t connected_addr_len = sizeof(connected_addr);
+ ASSERT_THAT(getsockname(connected_fd.get(),
+ reinterpret_cast<sockaddr*>(&connected_addr),
+ &connected_addr_len),
+ SyscallSucceeds());
+ uint16_t const ephemeral_port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
+
+ // Verify that we actually got an ephemeral port.
+ ASSERT_NE(ephemeral_port, 0);
+
+ // Verify that the ephemeral port is reserved.
+ const FileDescriptor checking_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ EXPECT_THAT(
+ bind(checking_fd.get(), reinterpret_cast<sockaddr*>(&connected_addr),
+ connected_addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that binding the v4 loopback on the same port with a v6 socket
+ // fails.
+ TestAddress const& test_addr_v4_mapped = V4MappedLoopback();
+ sockaddr_storage addr_v4_mapped = test_addr_v4_mapped.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(test_addr_v4_mapped.family(), &addr_v4_mapped,
+ ephemeral_port));
+ const FileDescriptor fd_v4_mapped = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v4_mapped.family(), param.type, 0));
+ EXPECT_THAT(
+ bind(fd_v4_mapped.get(), reinterpret_cast<sockaddr*>(&addr_v4_mapped),
+ test_addr_v4_mapped.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // Verify that binding the v6 any on the same port with a dual-stack socket
+ // fails.
+ TestAddress const& test_addr_v6_any = V6Any();
+ sockaddr_storage addr_v6_any = test_addr_v6_any.addr;
+ ASSERT_NO_ERRNO(
+ SetAddrPort(test_addr_v6_any.family(), &addr_v6_any, ephemeral_port));
+ const FileDescriptor fd_v6_any = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v6_any.family(), param.type, 0));
+ ASSERT_THAT(bind(fd_v6_any.get(), reinterpret_cast<sockaddr*>(&addr_v6_any),
+ test_addr_v6_any.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+
+ // For some reason, binding the TCP v6-only any is flaky on Linux. Maybe we
+ // tend to run out of ephemeral ports? Regardless, binding the v6 loopback
+ // seems pretty reliable. Only try to bind the v6-only any on UDP and
+ // gVisor.
+
+ int ret = -1;
+
+ if (!IsRunningOnGvisor() && param.type == SOCK_STREAM) {
+ // Verify that we can still bind the v6 loopback on the same port.
+ TestAddress const& test_addr_v6 = V6Loopback();
+ sockaddr_storage addr_v6 = test_addr_v6.addr;
+ ASSERT_NO_ERRNO(
+ SetAddrPort(test_addr_v6.family(), &addr_v6, ephemeral_port));
+ const FileDescriptor fd_v6 = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v6.family(), param.type, 0));
+ ret = bind(fd_v6.get(), reinterpret_cast<sockaddr*>(&addr_v6),
+ test_addr_v6.addr_len);
+ } else {
+ // Verify that we can still bind the v6 any on the same port with a
+ // v6-only socket.
+ const FileDescriptor fd_v6_only_any = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(test_addr_v6_any.family(), param.type, 0));
+ EXPECT_THAT(setsockopt(fd_v6_only_any.get(), IPPROTO_IPV6, IPV6_V6ONLY,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ret =
+ bind(fd_v6_only_any.get(), reinterpret_cast<sockaddr*>(&addr_v6_any),
+ test_addr_v6_any.addr_len);
+ }
+
+ if (ret == -1 && errno == EADDRINUSE) {
+ // Port may have been in use.
+ ASSERT_LT(i, 100); // Give up after 100 tries.
+ continue;
+ }
+ EXPECT_THAT(ret, SyscallSucceeds());
+
+ // No need to try again.
+ break;
+ }
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest, V4EphemeralPortReservedReuseAddr) {
+ auto const& param = GetParam();
+
+ // Bind the v4 loopback on a v4 socket.
+ TestAddress const& test_addr = V4Loopback();
+ sockaddr_storage bound_addr = test_addr.addr;
+ const FileDescriptor bound_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+
+ ASSERT_THAT(setsockopt(bound_fd.get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(bind(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+ test_addr.addr_len),
+ SyscallSucceeds());
+
+ // Listen iff TCP.
+ if (param.type == SOCK_STREAM) {
+ ASSERT_THAT(listen(bound_fd.get(), SOMAXCONN), SyscallSucceeds());
+ }
+
+ // Get the port that we bound.
+ socklen_t bound_addr_len = test_addr.addr_len;
+ ASSERT_THAT(
+ getsockname(bound_fd.get(), reinterpret_cast<sockaddr*>(&bound_addr),
+ &bound_addr_len),
+ SyscallSucceeds());
+
+ // Connect to bind an ephemeral port.
+ const FileDescriptor connected_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+
+ ASSERT_THAT(setsockopt(connected_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(RetryEINTR(connect)(connected_fd.get(),
+ reinterpret_cast<sockaddr*>(&bound_addr),
+ bound_addr_len),
+ SyscallSucceeds());
+
+ // Get the ephemeral port.
+ sockaddr_storage connected_addr = {};
+ socklen_t connected_addr_len = sizeof(connected_addr);
+ ASSERT_THAT(getsockname(connected_fd.get(),
+ reinterpret_cast<sockaddr*>(&connected_addr),
+ &connected_addr_len),
+ SyscallSucceeds());
+ uint16_t const ephemeral_port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(test_addr.family(), connected_addr));
+
+ // Verify that we actually got an ephemeral port.
+ ASSERT_NE(ephemeral_port, 0);
+
+ // Verify that the ephemeral port is not reserved.
+ const FileDescriptor checking_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ ASSERT_THAT(setsockopt(checking_fd.get(), SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ EXPECT_THAT(
+ bind(checking_fd.get(), reinterpret_cast<sockaddr*>(&connected_addr),
+ connected_addr_len),
+ SyscallSucceeds());
+}
+
+TEST_P(SocketMultiProtocolInetLoopbackTest, PortReuseTwoSockets) {
+ auto const& param = GetParam();
+ TestAddress const& test_addr = V4Loopback();
+ sockaddr_storage addr = test_addr.addr;
+
+ for (int i = 0; i < 2; i++) {
+ const int portreuse1 = i % 2;
+ auto s1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ int fd1 = s1.get();
+ socklen_t addrlen = test_addr.addr_len;
+
+ EXPECT_THAT(
+ setsockopt(fd1, SOL_SOCKET, SO_REUSEPORT, &portreuse1, sizeof(int)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(bind(fd1, reinterpret_cast<sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+
+ ASSERT_THAT(getsockname(fd1, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ if (param.type == SOCK_STREAM) {
+ ASSERT_THAT(listen(fd1, 1), SyscallSucceeds());
+ }
+
+ // j is less than 4 to check that the port reuse logic works correctly after
+ // closing bound sockets.
+ for (int j = 0; j < 4; j++) {
+ const int portreuse2 = j % 2;
+ auto s2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ int fd2 = s2.get();
+
+ EXPECT_THAT(
+ setsockopt(fd2, SOL_SOCKET, SO_REUSEPORT, &portreuse2, sizeof(int)),
+ SyscallSucceeds());
+
+ std::cout << portreuse1 << " " << portreuse2 << std::endl;
+ int ret = bind(fd2, reinterpret_cast<sockaddr*>(&addr), addrlen);
+
+ // Verify that two sockets can be bound to the same port only if
+ // SO_REUSEPORT is set for both of them.
+ if (!portreuse1 || !portreuse2) {
+ ASSERT_THAT(ret, SyscallFailsWithErrno(EADDRINUSE));
+ } else {
+ ASSERT_THAT(ret, SyscallSucceeds());
+ }
+ }
+ }
+}
+
+// Check that when a socket was bound to an address with REUSEPORT and then
+// closed, we can bind a different socket to the same address without needing
+// REUSEPORT.
+TEST_P(SocketMultiProtocolInetLoopbackTest, NoReusePortFollowingReusePort) {
+ auto const& param = GetParam();
+ TestAddress const& test_addr = V4Loopback();
+ sockaddr_storage addr = test_addr.addr;
+
+ auto s = ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ int fd = s.get();
+ socklen_t addrlen = test_addr.addr_len;
+ int portreuse = 1;
+ ASSERT_THAT(
+ setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &portreuse, sizeof(portreuse)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(fd, reinterpret_cast<sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+ ASSERT_THAT(getsockname(fd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ ASSERT_EQ(addrlen, test_addr.addr_len);
+
+ s.reset();
+
+ // Open a new socket and bind to the same address, but w/o REUSEPORT.
+ s = ASSERT_NO_ERRNO_AND_VALUE(Socket(test_addr.family(), param.type, 0));
+ fd = s.get();
+ portreuse = 0;
+ ASSERT_THAT(
+ setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &portreuse, sizeof(portreuse)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(fd, reinterpret_cast<sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllFamilies, SocketMultiProtocolInetLoopbackTest,
+ ::testing::Values(ProtocolTestParam{"TCP", SOCK_STREAM},
+ ProtocolTestParam{"UDP", SOCK_DGRAM}),
+ DescribeProtocolTestParam);
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_inet_loopback_nogotsan.cc b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc
new file mode 100644
index 000000000..2324c7f6a
--- /dev/null
+++ b/test/syscalls/linux/socket_inet_loopback_nogotsan.cc
@@ -0,0 +1,171 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <string.h>
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/save_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using ::testing::Gt;
+
+PosixErrorOr<uint16_t> AddrPort(int family, sockaddr_storage const& addr) {
+ switch (family) {
+ case AF_INET:
+ return static_cast<uint16_t>(
+ reinterpret_cast<sockaddr_in const*>(&addr)->sin_port);
+ case AF_INET6:
+ return static_cast<uint16_t>(
+ reinterpret_cast<sockaddr_in6 const*>(&addr)->sin6_port);
+ default:
+ return PosixError(EINVAL,
+ absl::StrCat("unknown socket family: ", family));
+ }
+}
+
+PosixError SetAddrPort(int family, sockaddr_storage* addr, uint16_t port) {
+ switch (family) {
+ case AF_INET:
+ reinterpret_cast<sockaddr_in*>(addr)->sin_port = port;
+ return NoError();
+ case AF_INET6:
+ reinterpret_cast<sockaddr_in6*>(addr)->sin6_port = port;
+ return NoError();
+ default:
+ return PosixError(EINVAL,
+ absl::StrCat("unknown socket family: ", family));
+ }
+}
+
+struct TestParam {
+ TestAddress listener;
+ TestAddress connector;
+};
+
+std::string DescribeTestParam(::testing::TestParamInfo<TestParam> const& info) {
+ return absl::StrCat("Listen", info.param.listener.description, "_Connect",
+ info.param.connector.description);
+}
+
+using SocketInetLoopbackTest = ::testing::TestWithParam<TestParam>;
+
+// This test verifies that connect returns EADDRNOTAVAIL if all local ephemeral
+// ports are already in use for a given destination ip/port.
+// We disable S/R because this test creates a large number of sockets.
+TEST_P(SocketInetLoopbackTest, TestTCPPortExhaustion_NoRandomSave) {
+ auto const& param = GetParam();
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ constexpr int kBacklog = 10;
+ constexpr int kClients = 65536;
+
+ // Create the listening socket.
+ auto listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), kBacklog), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+
+ // Disable cooperative S/R as we are making too many syscalls.
+ DisableSave ds;
+
+ // Now we keep opening connections till we run out of local ephemeral ports.
+ // and assert the error we get back.
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ std::vector<FileDescriptor> clients;
+ std::vector<FileDescriptor> servers;
+
+ for (int i = 0; i < kClients; i++) {
+ FileDescriptor client = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM, IPPROTO_TCP));
+ int ret = connect(client.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len);
+ if (ret == 0) {
+ clients.push_back(std::move(client));
+ FileDescriptor server =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+ servers.push_back(std::move(server));
+ continue;
+ }
+ ASSERT_THAT(ret, SyscallFailsWithErrno(EADDRNOTAVAIL));
+ break;
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ All, SocketInetLoopbackTest,
+ ::testing::Values(
+ // Listeners bound to IPv4 addresses refuse connections using IPv6
+ // addresses.
+ TestParam{V4Any(), V4Any()}, TestParam{V4Any(), V4Loopback()},
+ TestParam{V4Any(), V4MappedAny()},
+ TestParam{V4Any(), V4MappedLoopback()},
+ TestParam{V4Loopback(), V4Any()}, TestParam{V4Loopback(), V4Loopback()},
+ TestParam{V4Loopback(), V4MappedLoopback()},
+ TestParam{V4MappedAny(), V4Any()},
+ TestParam{V4MappedAny(), V4Loopback()},
+ TestParam{V4MappedAny(), V4MappedAny()},
+ TestParam{V4MappedAny(), V4MappedLoopback()},
+ TestParam{V4MappedLoopback(), V4Any()},
+ TestParam{V4MappedLoopback(), V4Loopback()},
+ TestParam{V4MappedLoopback(), V4MappedLoopback()},
+
+ // Listeners bound to IN6ADDR_ANY accept all connections.
+ TestParam{V6Any(), V4Any()}, TestParam{V6Any(), V4Loopback()},
+ TestParam{V6Any(), V4MappedAny()},
+ TestParam{V6Any(), V4MappedLoopback()}, TestParam{V6Any(), V6Any()},
+ TestParam{V6Any(), V6Loopback()},
+
+ // Listeners bound to IN6ADDR_LOOPBACK refuse connections using IPv4
+ // addresses.
+ TestParam{V6Loopback(), V6Any()},
+ TestParam{V6Loopback(), V6Loopback()}),
+ DescribeTestParam);
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc
new file mode 100644
index 000000000..fda252dd7
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc
@@ -0,0 +1,49 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/tcp.h>
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_blocking.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return VecCat<SocketPairKind>(
+ std::vector<SocketPairKind>{
+ IPv6UDPBidirectionalBindSocketPair(0),
+ IPv4UDPBidirectionalBindSocketPair(0),
+ },
+ ApplyVecToVec<SocketPairKind>(
+ std::vector<Middleware>{
+ NoOp, SetSockOpt(IPPROTO_TCP, TCP_NODELAY, &kSockOptOn)},
+ std::vector<SocketPairKind>{
+ IPv6TCPAcceptBindSocketPair(0),
+ IPv4TCPAcceptBindSocketPair(0),
+ }));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ BlockingIPSockets, BlockingSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
new file mode 100644
index 000000000..c2ecb639f
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -0,0 +1,1054 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_ip_tcp_generic.h"
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "absl/memory/memory.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(TCPSocketPairTest, TcpInfoSucceeds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct tcp_info opt = {};
+ socklen_t optLen = sizeof(opt);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_TCP, TCP_INFO, &opt, &optLen),
+ SyscallSucceeds());
+}
+
+TEST_P(TCPSocketPairTest, ShortTcpInfoSucceeds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct tcp_info opt = {};
+ socklen_t optLen = 1;
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_TCP, TCP_INFO, &opt, &optLen),
+ SyscallSucceeds());
+}
+
+TEST_P(TCPSocketPairTest, ZeroTcpInfoSucceeds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct tcp_info opt = {};
+ socklen_t optLen = 0;
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_TCP, TCP_INFO, &opt, &optLen),
+ SyscallSucceeds());
+}
+
+// This test validates that an RST is sent instead of a FIN when data is
+// unread on calls to close(2).
+TEST_P(TCPSocketPairTest, RSTSentOnCloseWithUnreadData) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char buf[10] = {};
+ ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Wait until t_ sees the data on its side but don't read it.
+ struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0};
+ constexpr int kPollTimeoutMs = 20000; // Wait up to 20 seconds for the data.
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ // Now close the connected without reading the data.
+ ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+
+ // Wait for the other end to receive the RST (up to 20 seconds).
+ struct pollfd poll_fd2 = {sockets->first_fd(), POLLIN | POLLHUP, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ // A shutdown with unread data will cause a RST to be sent instead
+ // of a FIN, per RFC 2525 section 2.17; this is also what Linux does.
+ ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallFailsWithErrno(ECONNRESET));
+}
+
+// This test will validate that a RST will cause POLLHUP to trigger.
+TEST_P(TCPSocketPairTest, RSTCausesPollHUP) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char buf[10] = {};
+ ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Wait until second sees the data on its side but don't read it.
+ struct pollfd poll_fd = {sockets->second_fd(), POLLIN, 0};
+ constexpr int kPollTimeoutMs = 20000; // Wait up to 20 seconds for the data.
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ(poll_fd.revents & POLLIN, POLLIN);
+
+ // Confirm we at least have one unread byte.
+ int bytes_available = 0;
+ ASSERT_THAT(
+ RetryEINTR(ioctl)(sockets->second_fd(), FIONREAD, &bytes_available),
+ SyscallSucceeds());
+ EXPECT_GT(bytes_available, 0);
+
+ // Now close the connected socket without reading the data from the second,
+ // this will cause a RST and we should see that with POLLHUP.
+ ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+
+ // Wait for the other end to receive the RST (up to 20 seconds).
+ struct pollfd poll_fd3 = {sockets->first_fd(), POLLHUP, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd3, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+ ASSERT_NE(poll_fd3.revents & POLLHUP, 0);
+}
+
+// This test validates that even if a RST is sent the other end will not
+// get an ECONNRESET until it's read all data.
+TEST_P(TCPSocketPairTest, RSTSentOnCloseWithUnreadDataAllowsReadBuffered) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char buf[10] = {};
+ ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ ASSERT_THAT(RetryEINTR(write)(sockets->second_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Wait until second sees the data on its side but don't read it.
+ struct pollfd poll_fd = {sockets->second_fd(), POLLIN, 0};
+ constexpr int kPollTimeoutMs = 30000; // Wait up to 30 seconds for the data.
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ // Wait until first sees the data on its side but don't read it.
+ struct pollfd poll_fd2 = {sockets->first_fd(), POLLIN, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ // Now close the connected socket without reading the data from the second.
+ ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+
+ // Wait for the other end to receive the RST (up to 30 seconds).
+ struct pollfd poll_fd3 = {sockets->first_fd(), POLLHUP, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd3, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ // Since we also have data buffered we should be able to read it before
+ // the syscall will fail with ECONNRESET.
+ ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // A shutdown with unread data will cause a RST to be sent instead
+ // of a FIN, per RFC 2525 section 2.17; this is also what Linux does.
+ ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallFailsWithErrno(ECONNRESET));
+}
+
+// This test will verify that a clean shutdown (FIN) is preformed when there
+// is unread data but only the write side is closed.
+TEST_P(TCPSocketPairTest, FINSentOnShutdownWrWithUnreadData) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char buf[10] = {};
+ ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Wait until t_ sees the data on its side but don't read it.
+ struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0};
+ constexpr int kPollTimeoutMs = 20000; // Wait up to 20 seconds for the data.
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ // Now shutdown the write end leaving the read end open.
+ ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_WR), SyscallSucceeds());
+
+ // Wait for the other end to receive the FIN (up to 20 seconds).
+ struct pollfd poll_fd2 = {sockets->first_fd(), POLLIN | POLLHUP, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ // Since we didn't shutdown the read end this will be a clean close.
+ ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(0));
+}
+
+// This test will verify that when data is received by a socket, even if it's
+// not read SHUT_RD will not cause any packets to be generated.
+TEST_P(TCPSocketPairTest, ShutdownRdShouldCauseNoPacketsWithUnreadData) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char buf[10] = {};
+ ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Wait until t_ sees the data on its side but don't read it.
+ struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0};
+ constexpr int kPollTimeoutMs = 20000; // Wait up to 20 seconds for the data.
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ // Now shutdown the read end, this will generate no packets to the other end.
+ ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_RD), SyscallSucceeds());
+
+ // We should not receive any events on the other side of the socket.
+ struct pollfd poll_fd2 = {sockets->first_fd(), POLLIN | POLLHUP, 0};
+ constexpr int kPollNoResponseTimeoutMs = 3000;
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollNoResponseTimeoutMs),
+ SyscallSucceedsWithValue(0)); // Timeout.
+}
+
+// This test will verify that a socket which has unread data will still allow
+// the data to be read after shutting down the read side, and once there is no
+// unread data left, then read will return an EOF.
+TEST_P(TCPSocketPairTest, ShutdownRdAllowsReadOfReceivedDataBeforeEOF) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char buf[10] = {};
+ ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Wait until t_ sees the data on its side but don't read it.
+ struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0};
+ constexpr int kPollTimeoutMs = 20000; // Wait up to 20 seconds for the data.
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ // Now shutdown the read end.
+ ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_RD), SyscallSucceeds());
+
+ // Even though we did a SHUT_RD on the read end we can still read the data.
+ ASSERT_THAT(RetryEINTR(read)(sockets->second_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // After reading all of the data, reading the closed read end returns EOF.
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+ ASSERT_THAT(RetryEINTR(read)(sockets->second_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(0));
+}
+
+// This test verifies that a shutdown(wr) by the server after sending
+// data allows the client to still read() the queued data and a client
+// close after sending response allows server to read the incoming
+// response.
+TEST_P(TCPSocketPairTest, ShutdownWrServerClientClose) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char buf[10] = {};
+ ScopedThread t([&]() {
+ ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ ASSERT_THAT(close(sockets->release_first_fd()),
+ SyscallSucceedsWithValue(0));
+ });
+ ASSERT_THAT(RetryEINTR(write)(sockets->second_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ ASSERT_THAT(RetryEINTR(shutdown)(sockets->second_fd(), SHUT_WR),
+ SyscallSucceedsWithValue(0));
+ t.Join();
+
+ ASSERT_THAT(RetryEINTR(read)(sockets->second_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+}
+
+TEST_P(TCPSocketPairTest, ClosedReadNonBlockingSocket) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // Set the read end to O_NONBLOCK.
+ int opts = 0;
+ ASSERT_THAT(opts = fcntl(sockets->second_fd(), F_GETFL), SyscallSucceeds());
+ ASSERT_THAT(fcntl(sockets->second_fd(), F_SETFL, opts | O_NONBLOCK),
+ SyscallSucceeds());
+
+ char buf[10] = {};
+ ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Wait until second_fd sees the data and then recv it.
+ struct pollfd poll_fd = {sockets->second_fd(), POLLIN, 0};
+ constexpr int kPollTimeoutMs = 2000; // Wait up to 2 seconds for the data.
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Now shutdown the write end leaving the read end open.
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+
+ // Wait for close notification and recv again.
+ struct pollfd poll_fd2 = {sockets->second_fd(), POLLIN, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(TCPSocketPairTest,
+ ShutdownRdUnreadDataShouldCauseNoPacketsUnlessClosed) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char buf[10] = {};
+ ASSERT_THAT(RetryEINTR(write)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Wait until t_ sees the data on its side but don't read it.
+ struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0};
+ constexpr int kPollTimeoutMs = 20000; // Wait up to 20 seconds for the data.
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+
+ // Now shutdown the read end, this will generate no packets to the other end.
+ ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_RD), SyscallSucceeds());
+
+ // We should not receive any events on the other side of the socket.
+ struct pollfd poll_fd2 = {sockets->first_fd(), POLLIN | POLLHUP, 0};
+ constexpr int kPollNoResponseTimeoutMs = 3000;
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollNoResponseTimeoutMs),
+ SyscallSucceedsWithValue(0)); // Timeout.
+
+ // Now since we've fully closed the connection it will generate a RST.
+ ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd2, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1)); // The other end has closed.
+
+ // A shutdown with unread data will cause a RST to be sent instead
+ // of a FIN, per RFC 2525 section 2.17; this is also what Linux does.
+ ASSERT_THAT(RetryEINTR(read)(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallFailsWithErrno(ECONNRESET));
+}
+
+TEST_P(TCPSocketPairTest, TCPCorkDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPCork) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(TCPSocketPairTest, TCPCork) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ constexpr char kData[] = "abc";
+ ASSERT_THAT(WriteFd(sockets->first_fd(), kData, sizeof(kData)),
+ SyscallSucceedsWithValue(sizeof(kData)));
+
+ ASSERT_NO_FATAL_FAILURE(RecvNoData(sockets->second_fd()));
+
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CORK,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ // Create a receive buffer larger than kData.
+ char buf[(sizeof(kData) + 1) * 2] = {};
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(kData)));
+ EXPECT_EQ(absl::string_view(kData, sizeof(kData)),
+ absl::string_view(buf, sizeof(kData)));
+}
+
+TEST_P(TCPSocketPairTest, TCPQuickAckDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_QUICKACK, &get,
+ &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPQuickAck) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_QUICKACK,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_QUICKACK, &get,
+ &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_QUICKACK,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_QUICKACK, &get,
+ &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+}
+
+TEST_P(TCPSocketPairTest, SoKeepaliveDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_KEEPALIVE, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(TCPSocketPairTest, SetSoKeepalive) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_KEEPALIVE,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_KEEPALIVE, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_KEEPALIVE,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_KEEPALIVE, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(TCPSocketPairTest, TCPKeepidleDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPIDLE, &get,
+ &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 2 * 60 * 60); // 2 hours.
+}
+
+TEST_P(TCPSocketPairTest, TCPKeepintvlDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPINTVL, &get,
+ &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 75); // 75 seconds.
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepidleZero) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kZero = 0;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPIDLE, &kZero,
+ sizeof(kZero)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepintvlZero) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kZero = 0;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPINTVL,
+ &kZero, sizeof(kZero)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Copied from include/net/tcp.h.
+constexpr int MAX_TCP_KEEPIDLE = 32767;
+constexpr int MAX_TCP_KEEPINTVL = 32767;
+constexpr int MAX_TCP_KEEPCNT = 127;
+
+TEST_P(TCPSocketPairTest, SetTCPKeepidleAboveMax) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kAboveMax = MAX_TCP_KEEPIDLE + 1;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPIDLE,
+ &kAboveMax, sizeof(kAboveMax)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepintvlAboveMax) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kAboveMax = MAX_TCP_KEEPINTVL + 1;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPINTVL,
+ &kAboveMax, sizeof(kAboveMax)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepidleToMax) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPIDLE,
+ &MAX_TCP_KEEPIDLE, sizeof(MAX_TCP_KEEPIDLE)),
+ SyscallSucceedsWithValue(0));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPIDLE, &get,
+ &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, MAX_TCP_KEEPIDLE);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepintvlToMax) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPINTVL,
+ &MAX_TCP_KEEPINTVL, sizeof(MAX_TCP_KEEPINTVL)),
+ SyscallSucceedsWithValue(0));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPINTVL, &get,
+ &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, MAX_TCP_KEEPINTVL);
+}
+
+TEST_P(TCPSocketPairTest, TCPKeepcountDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 9); // 9 keepalive probes.
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepcountZero) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kZero = 0;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT, &kZero,
+ sizeof(kZero)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepcountAboveMax) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kAboveMax = MAX_TCP_KEEPCNT + 1;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT,
+ &kAboveMax, sizeof(kAboveMax)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepcountToMax) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT,
+ &MAX_TCP_KEEPCNT, sizeof(MAX_TCP_KEEPCNT)),
+ SyscallSucceedsWithValue(0));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, MAX_TCP_KEEPCNT);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepcountToOne) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int keepaliveCount = 1;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT,
+ &keepaliveCount, sizeof(keepaliveCount)),
+ SyscallSucceedsWithValue(0));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, keepaliveCount);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPKeepcountToNegative) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int keepaliveCount = -5;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_KEEPCNT,
+ &keepaliveCount, sizeof(keepaliveCount)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(TCPSocketPairTest, SetOOBInline) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_OOBINLINE,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_OOBINLINE, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+}
+
+TEST_P(TCPSocketPairTest, MsgTruncMsgPeek) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ // Read half of the data with MSG_TRUNC | MSG_PEEK. This way there will still
+ // be some data left to read in the next step even if the data gets consumed.
+ char received_data1[sizeof(sent_data) / 2] = {};
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data1,
+ sizeof(received_data1), MSG_TRUNC | MSG_PEEK),
+ SyscallSucceedsWithValue(sizeof(received_data1)));
+
+ // Check that we didn't get anything.
+ char zeros[sizeof(received_data1)] = {};
+ EXPECT_EQ(0, memcmp(zeros, received_data1, sizeof(received_data1)));
+
+ // Check that all of the data is still there.
+ char received_data2[sizeof(sent_data)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data2,
+ sizeof(received_data2), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ EXPECT_EQ(0, memcmp(received_data2, sent_data, sizeof(sent_data)));
+}
+
+TEST_P(TCPSocketPairTest, SetCongestionControlSucceedsForSupported) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ // Netstack only supports reno & cubic so we only test these two values here.
+ {
+ const char kSetCC[kTcpCaNameMax] = "reno";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax];
+ memset(got_cc, '1', sizeof(got_cc));
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kSetCC)));
+ }
+ {
+ const char kSetCC[kTcpCaNameMax] = "cubic";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax];
+ memset(got_cc, '1', sizeof(got_cc));
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kSetCC)));
+ }
+}
+
+TEST_P(TCPSocketPairTest, SetGetTCPCongestionShortReadBuffer) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ {
+ // Verify that getsockopt/setsockopt work with buffers smaller than
+ // kTcpCaNameMax.
+ const char kSetCC[] = "cubic";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[sizeof(kSetCC)];
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(got_cc)));
+ }
+}
+
+TEST_P(TCPSocketPairTest, SetGetTCPCongestionLargeReadBuffer) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ {
+ // Verify that getsockopt works with buffers larger than
+ // kTcpCaNameMax.
+ const char kSetCC[] = "cubic";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax + 5];
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ // Linux copies the minimum of kTcpCaNameMax or the length of the passed in
+ // buffer and sets optlen to the number of bytes actually copied
+ // irrespective of the actual length of the congestion control name.
+ EXPECT_EQ(kTcpCaNameMax, optlen);
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kSetCC)));
+ }
+}
+
+TEST_P(TCPSocketPairTest, SetCongestionControlFailsForUnsupported) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char old_cc[kTcpCaNameMax];
+ socklen_t optlen = sizeof(old_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &old_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+
+ const char kSetCC[] = "invalid_ca_cc";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallFailsWithErrno(ENOENT));
+
+ char got_cc[kTcpCaNameMax];
+ optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, memcmp(got_cc, old_cc, sizeof(old_cc)));
+}
+
+// Linux and Netstack both default to a 60s TCP_LINGER2 timeout.
+constexpr int kDefaultTCPLingerTimeout = 60;
+
+TEST_P(TCPSocketPairTest, TCPLingerTimeoutDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kDefaultTCPLingerTimeout);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutZeroOrLess) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kZero = 0;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &kZero,
+ sizeof(kZero)),
+ SyscallSucceedsWithValue(0));
+
+ constexpr int kNegative = -1234;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2,
+ &kNegative, sizeof(kNegative)),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeoutAboveDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // Values above the net.ipv4.tcp_fin_timeout are capped to tcp_fin_timeout
+ // on linux (defaults to 60 seconds on linux).
+ constexpr int kAboveDefault = kDefaultTCPLingerTimeout + 1;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2,
+ &kAboveDefault, sizeof(kAboveDefault)),
+ SyscallSucceedsWithValue(0));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kDefaultTCPLingerTimeout);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPLingerTimeout) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // Values above the net.ipv4.tcp_fin_timeout are capped to tcp_fin_timeout
+ // on linux (defaults to 60 seconds on linux).
+ constexpr int kTCPLingerTimeout = kDefaultTCPLingerTimeout - 1;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2,
+ &kTCPLingerTimeout, sizeof(kTCPLingerTimeout)),
+ SyscallSucceedsWithValue(0));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_LINGER2, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kTCPLingerTimeout);
+}
+
+TEST_P(TCPSocketPairTest, TestTCPCloseWithData) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ScopedThread t([&]() {
+ // Close one end to trigger sending of a FIN.
+ ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_WR), SyscallSucceeds());
+ char buf[3];
+ ASSERT_THAT(read(sockets->second_fd(), buf, 3),
+ SyscallSucceedsWithValue(3));
+ absl::SleepFor(absl::Milliseconds(50));
+ ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+ });
+
+ absl::SleepFor(absl::Milliseconds(50));
+ // Send some data then close.
+ constexpr char kStr[] = "abc";
+ ASSERT_THAT(write(sockets->first_fd(), kStr, 3), SyscallSucceedsWithValue(3));
+ t.Join();
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+}
+
+TEST_P(TCPSocketPairTest, TCPUserTimeoutDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 0); // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutZero) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kZero = 0;
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &kZero, sizeof(kZero)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 0); // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutBelowZero) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kNeg = -10;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &kNeg, sizeof(kNeg)),
+ SyscallFailsWithErrno(EINVAL));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 0); // 0 ms (disabled).
+}
+
+TEST_P(TCPSocketPairTest, SetTCPUserTimeoutAboveZero) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kAbove = 10;
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &kAbove, sizeof(kAbove)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kAbove);
+}
+
+TEST_P(TCPSocketPairTest, SetTCPWindowClampBelowMinRcvBufConnectedSocket) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ // Discover minimum receive buf by setting a really low value
+ // for the receive buffer.
+ constexpr int kZero = 0;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &kZero,
+ sizeof(kZero)),
+ SyscallSucceeds());
+
+ // Now retrieve the minimum value for SO_RCVBUF as the set above should
+ // have caused SO_RCVBUF for the socket to be set to the minimum.
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ int min_so_rcvbuf = get;
+
+ {
+ // Setting TCP_WINDOW_CLAMP to zero for a connected socket is not permitted.
+ constexpr int kZero = 0;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_WINDOW_CLAMP,
+ &kZero, sizeof(kZero)),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Non-zero clamp values below MIN_SO_RCVBUF/2 should result in the clamp
+ // being set to MIN_SO_RCVBUF/2.
+ int below_half_min_so_rcvbuf = min_so_rcvbuf / 2 - 1;
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_WINDOW_CLAMP,
+ &below_half_min_so_rcvbuf, sizeof(below_half_min_so_rcvbuf)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_WINDOW_CLAMP,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(min_so_rcvbuf / 2, get);
+ }
+}
+
+TEST_P(TCPSocketPairTest, IpMulticastTtlDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_GT(get, 0);
+}
+
+TEST_P(TCPSocketPairTest, IpMulticastLoopDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 1);
+}
+
+TEST_P(TCPSocketPairTest, TCPResetDuringClose_NoRandomSave) {
+ DisableSave ds; // Too many syscalls.
+ constexpr int kThreadCount = 1000;
+ std::unique_ptr<ScopedThread> instances[kThreadCount];
+ for (int i = 0; i < kThreadCount; i++) {
+ instances[i] = absl::make_unique<ScopedThread>([&]() {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ScopedThread t([&]() {
+ // Close one end to trigger sending of a FIN.
+ struct pollfd poll_fd = {sockets->second_fd(), POLLIN | POLLHUP, 0};
+ // Wait up to 20 seconds for the data.
+ constexpr int kPollTimeoutMs = 20000;
+ ASSERT_THAT(RetryEINTR(poll)(&poll_fd, 1, kPollTimeoutMs),
+ SyscallSucceedsWithValue(1));
+ ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+ });
+
+ // Send some data then close.
+ constexpr char kStr[] = "abc";
+ ASSERT_THAT(write(sockets->first_fd(), kStr, 3),
+ SyscallSucceedsWithValue(3));
+ absl::SleepFor(absl::Milliseconds(10));
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+ t.Join();
+ });
+ }
+ for (int i = 0; i < kThreadCount; i++) {
+ instances[i]->Join();
+ }
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.h b/test/syscalls/linux/socket_ip_tcp_generic.h
new file mode 100644
index 000000000..a3eff3c73
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_tcp_generic.h
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_TCP_GENERIC_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_TCP_GENERIC_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected TCP sockets.
+using TCPSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_TCP_GENERIC_H_
diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
new file mode 100644
index 000000000..4e79d21f4
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
@@ -0,0 +1,45 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/tcp.h>
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_ip_tcp_generic.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return ApplyVecToVec<SocketPairKind>(
+ std::vector<Middleware>{
+ NoOp, SetSockOpt(IPPROTO_TCP, TCP_NODELAY, &kSockOptOn)},
+ std::vector<SocketPairKind>{
+ IPv6TCPAcceptBindSocketPair(0),
+ IPv4TCPAcceptBindSocketPair(0),
+ DualStackTCPAcceptBindSocketPair(0),
+ });
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllTCPSockets, TCPSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback.cc b/test/syscalls/linux/socket_ip_tcp_loopback.cc
new file mode 100644
index 000000000..9db3037bc
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_tcp_loopback.cc
@@ -0,0 +1,40 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_generic.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return {
+ IPv6TCPAcceptBindSocketPair(0),
+ IPv4TCPAcceptBindSocketPair(0),
+ DualStackTCPAcceptBindSocketPair(0),
+ };
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, AllSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
new file mode 100644
index 000000000..f996b93d2
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
@@ -0,0 +1,45 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/tcp.h>
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_stream_blocking.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return ApplyVecToVec<SocketPairKind>(
+ std::vector<Middleware>{
+ NoOp, SetSockOpt(IPPROTO_TCP, TCP_NODELAY, &kSockOptOn)},
+ std::vector<SocketPairKind>{
+ IPv6TCPAcceptBindSocketPair(0),
+ IPv4TCPAcceptBindSocketPair(0),
+ DualStackTCPAcceptBindSocketPair(0),
+ });
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ BlockingTCPSockets, BlockingStreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
new file mode 100644
index 000000000..ffa377210
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
@@ -0,0 +1,44 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/tcp.h>
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_non_blocking.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return ApplyVecToVec<SocketPairKind>(
+ std::vector<Middleware>{
+ NoOp, SetSockOpt(IPPROTO_TCP, TCP_NODELAY, &kSockOptOn)},
+ std::vector<SocketPairKind>{
+ IPv6TCPAcceptBindSocketPair(SOCK_NONBLOCK),
+ IPv4TCPAcceptBindSocketPair(SOCK_NONBLOCK),
+ });
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ NonBlockingTCPSockets, NonBlockingSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
new file mode 100644
index 000000000..f178f1af9
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
@@ -0,0 +1,77 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Test fixture for tests that apply to pairs of TCP and UDP sockets.
+using TcpUdpSocketPairTest = SocketPairTest;
+
+TEST_P(TcpUdpSocketPairTest, ShutdownWrFollowedBySendIsError) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // Now shutdown the write end of the first.
+ ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_WR), SyscallSucceeds());
+
+ char buf[10] = {};
+ ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), buf, sizeof(buf), 0),
+ SyscallFailsWithErrno(EPIPE));
+}
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(
+ IPv6UDPBidirectionalBindSocketPair,
+ AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ IPv4UDPBidirectionalBindSocketPair,
+ AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ DualStackUDPBidirectionalBindSocketPair,
+ AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ IPv6TCPAcceptBindSocketPair,
+ AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ IPv4TCPAcceptBindSocketPair,
+ AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ DualStackTCPAcceptBindSocketPair,
+ AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK})));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllIPSockets, TcpUdpSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_generic.cc b/test/syscalls/linux/socket_ip_udp_generic.cc
new file mode 100644
index 000000000..edb86aded
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_udp_generic.cc
@@ -0,0 +1,452 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_ip_udp_generic.h"
+
+#include <errno.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(UDPSocketPairTest, MulticastTTLDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 1);
+}
+
+TEST_P(UDPSocketPairTest, SetUDPMulticastTTLMin) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kMin = 0;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &kMin, sizeof(kMin)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kMin);
+}
+
+TEST_P(UDPSocketPairTest, SetUDPMulticastTTLMax) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kMax = 255;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &kMax, sizeof(kMax)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kMax);
+}
+
+TEST_P(UDPSocketPairTest, SetUDPMulticastTTLNegativeOne) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kArbitrary = 6;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &kArbitrary, sizeof(kArbitrary)),
+ SyscallSucceeds());
+
+ constexpr int kNegOne = -1;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &kNegOne, sizeof(kNegOne)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 1);
+}
+
+TEST_P(UDPSocketPairTest, SetUDPMulticastTTLBelowMin) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kBelowMin = -2;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &kBelowMin, sizeof(kBelowMin)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(UDPSocketPairTest, SetUDPMulticastTTLAboveMax) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr int kAboveMax = 256;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &kAboveMax, sizeof(kAboveMax)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(UDPSocketPairTest, SetUDPMulticastTTLChar) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr char kArbitrary = 6;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &kArbitrary, sizeof(kArbitrary)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_TTL,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kArbitrary);
+}
+
+TEST_P(UDPSocketPairTest, SetEmptyIPAddMembership) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct ip_mreqn req = {};
+ EXPECT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
+ &req, sizeof(req)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(UDPSocketPairTest, MulticastLoopDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+}
+
+TEST_P(UDPSocketPairTest, SetMulticastLoop) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+}
+
+TEST_P(UDPSocketPairTest, SetMulticastLoopChar) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ constexpr char kSockOptOnChar = kSockOptOn;
+ constexpr char kSockOptOffChar = kSockOptOff;
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &kSockOptOffChar, sizeof(kSockOptOffChar)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &kSockOptOnChar, sizeof(kSockOptOnChar)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+}
+
+TEST_P(UDPSocketPairTest, ReuseAddrDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(UDPSocketPairTest, SetReuseAddr) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(UDPSocketPairTest, ReusePortDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(UDPSocketPairTest, SetReusePort) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(UDPSocketPairTest, SetReuseAddrReusePort) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEADDR, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_REUSEPORT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+}
+
+// Test getsockopt for a socket which is not set with IP_PKTINFO option.
+TEST_P(UDPSocketPairTest, IPPKTINFODefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_IP, IP_PKTINFO, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test setsockopt and getsockopt for a socket with IP_PKTINFO option.
+TEST_P(UDPSocketPairTest, SetAndGetIPPKTINFO) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int level = SOL_IP;
+ int type = IP_PKTINFO;
+
+ // Check getsockopt before IP_PKTINFO is set.
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+
+ ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get, kSockOptOn);
+ EXPECT_EQ(get_len, sizeof(get));
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), level, type, &kSockOptOff,
+ sizeof(kSockOptOff)),
+ SyscallSucceedsWithValue(0));
+
+ ASSERT_THAT(getsockopt(sockets->first_fd(), level, type, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get, kSockOptOff);
+ EXPECT_EQ(get_len, sizeof(get));
+}
+
+// Holds TOS or TClass information for IPv4 or IPv6 respectively.
+struct RecvTosOption {
+ int level;
+ int option;
+};
+
+RecvTosOption GetRecvTosOption(int domain) {
+ TEST_CHECK(domain == AF_INET || domain == AF_INET6);
+ RecvTosOption opt;
+ switch (domain) {
+ case AF_INET:
+ opt.level = IPPROTO_IP;
+ opt.option = IP_RECVTOS;
+ break;
+ case AF_INET6:
+ opt.level = IPPROTO_IPV6;
+ opt.option = IPV6_RECVTCLASS;
+ break;
+ }
+ return opt;
+}
+
+// Ensure that Receiving TOS or TCLASS is off by default.
+TEST_P(UDPSocketPairTest, RecvTosDefault) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ RecvTosOption t = GetRecvTosOption(GetParam().domain);
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+// Test that setting and getting IP_RECVTOS or IPV6_RECVTCLASS works as
+// expected.
+TEST_P(UDPSocketPairTest, SetRecvTos) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ RecvTosOption t = GetRecvTosOption(GetParam().domain);
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOff,
+ sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+
+ ASSERT_THAT(setsockopt(sockets->first_fd(), t.level, t.option, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+}
+
+// Test that any socket (including IPv6 only) accepts the IPv4 TOS option: this
+// mirrors behavior in linux.
+TEST_P(UDPSocketPairTest, TOSRecvMismatch) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ RecvTosOption t = GetRecvTosOption(AF_INET);
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), t.level, t.option, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+}
+
+// Test that an IPv4 socket does not support the IPv6 TClass option.
+TEST_P(UDPSocketPairTest, TClassRecvMismatch) {
+ // This should only test AF_INET sockets for the mismatch behavior.
+ SKIP_IF(GetParam().domain != AF_INET);
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_IPV6, IPV6_RECVTCLASS,
+ &get, &get_len),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_generic.h b/test/syscalls/linux/socket_ip_udp_generic.h
new file mode 100644
index 000000000..106c54e9f
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_udp_generic.h
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_GENERIC_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_GENERIC_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected UDP sockets.
+using UDPSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IP_UDP_GENERIC_H_
diff --git a/test/syscalls/linux/socket_ip_udp_loopback.cc b/test/syscalls/linux/socket_ip_udp_loopback.cc
new file mode 100644
index 000000000..c7fa44884
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_udp_loopback.cc
@@ -0,0 +1,50 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_generic.h"
+#include "test/syscalls/linux/socket_ip_udp_generic.h"
+#include "test/syscalls/linux/socket_non_stream.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return {
+ IPv6UDPBidirectionalBindSocketPair(0),
+ IPv4UDPBidirectionalBindSocketPair(0),
+ DualStackUDPBidirectionalBindSocketPair(0),
+ };
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUDPSockets, AllSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUDPSockets, NonStreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUDPSockets, UDPSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
new file mode 100644
index 000000000..d6925a8df
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
@@ -0,0 +1,39 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_non_stream_blocking.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return {
+ IPv6UDPBidirectionalBindSocketPair(0),
+ IPv4UDPBidirectionalBindSocketPair(0),
+ };
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ BlockingUDPSockets, BlockingNonStreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
new file mode 100644
index 000000000..d675eddc6
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
@@ -0,0 +1,39 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_non_blocking.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return {
+ IPv6UDPBidirectionalBindSocketPair(SOCK_NONBLOCK),
+ IPv4UDPBidirectionalBindSocketPair(SOCK_NONBLOCK),
+ };
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ NonBlockingUDPSockets, NonBlockingSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ip_unbound.cc b/test/syscalls/linux/socket_ip_unbound.cc
new file mode 100644
index 000000000..1c7b0cf90
--- /dev/null
+++ b/test/syscalls/linux/socket_ip_unbound.cc
@@ -0,0 +1,474 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <cstdio>
+#include <cstring>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of IP sockets.
+using IPUnboundSocketTest = SimpleSocketTest;
+
+TEST_P(IPUnboundSocketTest, TtlDefault) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ EXPECT_THAT(getsockopt(socket->get(), IPPROTO_IP, IP_TTL, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_TRUE(get == 64 || get == 127);
+ EXPECT_EQ(get_sz, sizeof(get));
+}
+
+TEST_P(IPUnboundSocketTest, SetTtl) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int get1 = -1;
+ socklen_t get1_sz = sizeof(get1);
+ EXPECT_THAT(getsockopt(socket->get(), IPPROTO_IP, IP_TTL, &get1, &get1_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get1_sz, sizeof(get1));
+
+ int set = 100;
+ if (set == get1) {
+ set += 1;
+ }
+ socklen_t set_sz = sizeof(set);
+ EXPECT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_TTL, &set, set_sz),
+ SyscallSucceedsWithValue(0));
+
+ int get2 = -1;
+ socklen_t get2_sz = sizeof(get2);
+ EXPECT_THAT(getsockopt(socket->get(), IPPROTO_IP, IP_TTL, &get2, &get2_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get2_sz, sizeof(get2));
+ EXPECT_EQ(get2, set);
+}
+
+TEST_P(IPUnboundSocketTest, ResetTtlToDefault) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int get1 = -1;
+ socklen_t get1_sz = sizeof(get1);
+ EXPECT_THAT(getsockopt(socket->get(), IPPROTO_IP, IP_TTL, &get1, &get1_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get1_sz, sizeof(get1));
+
+ int set1 = 100;
+ if (set1 == get1) {
+ set1 += 1;
+ }
+ socklen_t set1_sz = sizeof(set1);
+ EXPECT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_TTL, &set1, set1_sz),
+ SyscallSucceedsWithValue(0));
+
+ int set2 = -1;
+ socklen_t set2_sz = sizeof(set2);
+ EXPECT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_TTL, &set2, set2_sz),
+ SyscallSucceedsWithValue(0));
+
+ int get2 = -1;
+ socklen_t get2_sz = sizeof(get2);
+ EXPECT_THAT(getsockopt(socket->get(), IPPROTO_IP, IP_TTL, &get2, &get2_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get2_sz, sizeof(get2));
+ EXPECT_EQ(get2, get1);
+}
+
+TEST_P(IPUnboundSocketTest, ZeroTtl) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int set = 0;
+ socklen_t set_sz = sizeof(set);
+ EXPECT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_TTL, &set, set_sz),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(IPUnboundSocketTest, InvalidLargeTtl) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int set = 256;
+ socklen_t set_sz = sizeof(set);
+ EXPECT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_TTL, &set, set_sz),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(IPUnboundSocketTest, InvalidNegativeTtl) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int set = -2;
+ socklen_t set_sz = sizeof(set);
+ EXPECT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_TTL, &set, set_sz),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+struct TOSOption {
+ int level;
+ int option;
+ int cmsg_level;
+};
+
+constexpr int INET_ECN_MASK = 3;
+
+static TOSOption GetTOSOption(int domain) {
+ TOSOption opt;
+ switch (domain) {
+ case AF_INET:
+ opt.level = IPPROTO_IP;
+ opt.option = IP_TOS;
+ opt.cmsg_level = SOL_IP;
+ break;
+ case AF_INET6:
+ opt.level = IPPROTO_IPV6;
+ opt.option = IPV6_TCLASS;
+ opt.cmsg_level = SOL_IPV6;
+ break;
+ }
+ return opt;
+}
+
+TEST_P(IPUnboundSocketTest, TOSDefault) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ TOSOption t = GetTOSOption(GetParam().domain);
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ constexpr int kDefaultTOS = 0;
+ ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_sz, sizeof(get));
+ EXPECT_EQ(get, kDefaultTOS);
+}
+
+TEST_P(IPUnboundSocketTest, SetTOS) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ int set = 0xC0;
+ socklen_t set_sz = sizeof(set);
+ TOSOption t = GetTOSOption(GetParam().domain);
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
+ SyscallSucceedsWithValue(0));
+
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_sz, sizeof(get));
+ EXPECT_EQ(get, set);
+}
+
+TEST_P(IPUnboundSocketTest, ZeroTOS) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ int set = 0;
+ socklen_t set_sz = sizeof(set);
+ TOSOption t = GetTOSOption(GetParam().domain);
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
+ SyscallSucceedsWithValue(0));
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_sz, sizeof(get));
+ EXPECT_EQ(get, set);
+}
+
+TEST_P(IPUnboundSocketTest, InvalidLargeTOS) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ // Test with exceeding the byte space.
+ int set = 256;
+ constexpr int kDefaultTOS = 0;
+ socklen_t set_sz = sizeof(set);
+ TOSOption t = GetTOSOption(GetParam().domain);
+ if (GetParam().domain == AF_INET) {
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
+ SyscallSucceedsWithValue(0));
+ } else {
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
+ SyscallFailsWithErrno(EINVAL));
+ }
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_sz, sizeof(get));
+ EXPECT_EQ(get, kDefaultTOS);
+}
+
+TEST_P(IPUnboundSocketTest, CheckSkipECN) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ int set = 0xFF;
+ socklen_t set_sz = sizeof(set);
+ TOSOption t = GetTOSOption(GetParam().domain);
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
+ SyscallSucceedsWithValue(0));
+ int expect = static_cast<uint8_t>(set);
+ if (GetParam().protocol == IPPROTO_TCP) {
+ expect &= ~INET_ECN_MASK;
+ }
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_sz, sizeof(get));
+ EXPECT_EQ(get, expect);
+}
+
+TEST_P(IPUnboundSocketTest, ZeroTOSOptionSize) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ int set = 0xC0;
+ socklen_t set_sz = 0;
+ TOSOption t = GetTOSOption(GetParam().domain);
+ if (GetParam().domain == AF_INET) {
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
+ SyscallSucceedsWithValue(0));
+ } else {
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
+ SyscallFailsWithErrno(EINVAL));
+ }
+ int get = -1;
+ socklen_t get_sz = 0;
+ ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_sz, 0);
+ EXPECT_EQ(get, -1);
+}
+
+TEST_P(IPUnboundSocketTest, SmallTOSOptionSize) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ int set = 0xC0;
+ constexpr int kDefaultTOS = 0;
+ TOSOption t = GetTOSOption(GetParam().domain);
+ for (socklen_t i = 1; i < sizeof(int); i++) {
+ int expect_tos;
+ socklen_t expect_sz;
+ if (GetParam().domain == AF_INET) {
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, i),
+ SyscallSucceedsWithValue(0));
+ expect_tos = set;
+ expect_sz = sizeof(uint8_t);
+ } else {
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, i),
+ SyscallFailsWithErrno(EINVAL));
+ expect_tos = kDefaultTOS;
+ expect_sz = i;
+ }
+ uint get = -1;
+ socklen_t get_sz = i;
+ ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_sz, expect_sz);
+ // Account for partial copies by getsockopt, retrieve the lower
+ // bits specified by get_sz, while comparing against expect_tos.
+ EXPECT_EQ(get & ~(~0 << (get_sz * 8)), expect_tos);
+ }
+}
+
+TEST_P(IPUnboundSocketTest, LargeTOSOptionSize) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ int set = 0xC0;
+ TOSOption t = GetTOSOption(GetParam().domain);
+ for (socklen_t i = sizeof(int); i < 10; i++) {
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, i),
+ SyscallSucceedsWithValue(0));
+ int get = -1;
+ socklen_t get_sz = i;
+ // We expect the system call handler to only copy atmost sizeof(int) bytes
+ // as asserted by the check below. Hence, we do not expect the copy to
+ // overflow in getsockopt.
+ ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_sz, sizeof(int));
+ EXPECT_EQ(get, set);
+ }
+}
+
+TEST_P(IPUnboundSocketTest, NegativeTOS) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int set = -1;
+ socklen_t set_sz = sizeof(set);
+ TOSOption t = GetTOSOption(GetParam().domain);
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
+ SyscallSucceedsWithValue(0));
+ int expect;
+ if (GetParam().domain == AF_INET) {
+ expect = static_cast<uint8_t>(set);
+ if (GetParam().protocol == IPPROTO_TCP) {
+ expect &= ~INET_ECN_MASK;
+ }
+ } else {
+ // On IPv6 TCLASS, setting -1 has the effect of resetting the
+ // TrafficClass.
+ expect = 0;
+ }
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_sz, sizeof(get));
+ EXPECT_EQ(get, expect);
+}
+
+TEST_P(IPUnboundSocketTest, InvalidNegativeTOS) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ int set = -2;
+ socklen_t set_sz = sizeof(set);
+ TOSOption t = GetTOSOption(GetParam().domain);
+ int expect;
+ if (GetParam().domain == AF_INET) {
+ ASSERT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
+ SyscallSucceedsWithValue(0));
+ expect = static_cast<uint8_t>(set);
+ if (GetParam().protocol == IPPROTO_TCP) {
+ expect &= ~INET_ECN_MASK;
+ }
+ } else {
+ ASSERT_THAT(setsockopt(socket->get(), t.level, t.option, &set, set_sz),
+ SyscallFailsWithErrno(EINVAL));
+ expect = 0;
+ }
+ int get = 0;
+ socklen_t get_sz = sizeof(get);
+ ASSERT_THAT(getsockopt(socket->get(), t.level, t.option, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_sz, sizeof(get));
+ EXPECT_EQ(get, expect);
+}
+
+TEST_P(IPUnboundSocketTest, NullTOS) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ TOSOption t = GetTOSOption(GetParam().domain);
+ int set_sz = sizeof(int);
+ if (GetParam().domain == AF_INET) {
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, nullptr, set_sz),
+ SyscallFailsWithErrno(EFAULT));
+ } else { // AF_INET6
+ // The AF_INET6 behavior is not yet compatible. gVisor will try to read
+ // optval from user memory at syscall handler, it needs substantial
+ // refactoring to implement this behavior just for IPv6.
+ if (IsRunningOnGvisor()) {
+ EXPECT_THAT(setsockopt(socket->get(), t.level, t.option, nullptr, set_sz),
+ SyscallFailsWithErrno(EFAULT));
+ } else {
+ // Linux's IPv6 stack treats nullptr optval as input of 0, so the call
+ // succeeds. (net/ipv6/ipv6_sockglue.c, do_ipv6_setsockopt())
+ //
+ // Linux's implementation would need fixing as passing a nullptr as optval
+ // and non-zero optlen may not be valid.
+ // TODO(b/158666797): Combine the gVisor and linux cases for IPv6.
+ // Some kernel versions return EFAULT, so we handle both.
+ EXPECT_THAT(
+ setsockopt(socket->get(), t.level, t.option, nullptr, set_sz),
+ AnyOf(SyscallFailsWithErrno(EFAULT), SyscallSucceedsWithValue(0)));
+ }
+ }
+ socklen_t get_sz = sizeof(int);
+ EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, nullptr, &get_sz),
+ SyscallFailsWithErrno(EFAULT));
+ int get = -1;
+ EXPECT_THAT(getsockopt(socket->get(), t.level, t.option, &get, nullptr),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_P(IPUnboundSocketTest, InsufficientBufferTOS) {
+ SKIP_IF(GetParam().protocol == IPPROTO_TCP);
+
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ TOSOption t = GetTOSOption(GetParam().domain);
+
+ in_addr addr4;
+ in6_addr addr6;
+ ASSERT_THAT(inet_pton(AF_INET, "127.0.0.1", &addr4), ::testing::Eq(1));
+ ASSERT_THAT(inet_pton(AF_INET6, "fe80::", &addr6), ::testing::Eq(1));
+
+ cmsghdr cmsg = {};
+ cmsg.cmsg_len = sizeof(cmsg);
+ cmsg.cmsg_level = t.cmsg_level;
+ cmsg.cmsg_type = t.option;
+
+ msghdr msg = {};
+ msg.msg_control = &cmsg;
+ msg.msg_controllen = sizeof(cmsg);
+ if (GetParam().domain == AF_INET) {
+ msg.msg_name = &addr4;
+ msg.msg_namelen = sizeof(addr4);
+ } else {
+ msg.msg_name = &addr6;
+ msg.msg_namelen = sizeof(addr6);
+ }
+
+ EXPECT_THAT(sendmsg(socket->get(), &msg, 0), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(IPUnboundSocketTest, ReuseAddrDefault) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(socket->get(), SOL_SOCKET, SO_REUSEADDR, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get, kSockOptOff);
+ EXPECT_EQ(get_sz, sizeof(get));
+}
+
+TEST_P(IPUnboundSocketTest, SetReuseAddr) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ASSERT_THAT(setsockopt(socket->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(socket->get(), SOL_SOCKET, SO_REUSEADDR, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get, kSockOptOn);
+ EXPECT_EQ(get_sz, sizeof(get));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ IPUnboundSockets, IPUnboundSocketTest,
+ ::testing::ValuesIn(VecCat<SocketKind>(VecCat<SocketKind>(
+ ApplyVec<SocketKind>(IPv4UDPUnboundSocket,
+ AllBitwiseCombinations(List<int>{SOCK_DGRAM},
+ List<int>{0,
+ SOCK_NONBLOCK})),
+ ApplyVec<SocketKind>(IPv6UDPUnboundSocket,
+ AllBitwiseCombinations(List<int>{SOCK_DGRAM},
+ List<int>{0,
+ SOCK_NONBLOCK})),
+ ApplyVec<SocketKind>(IPv4TCPUnboundSocket,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{0,
+ SOCK_NONBLOCK})),
+ ApplyVec<SocketKind>(IPv6TCPUnboundSocket,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{
+ 0, SOCK_NONBLOCK}))))));
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
new file mode 100644
index 000000000..80f12b0a9
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.cc
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h"
+
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <cstdio>
+#include <cstring>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Verifies that a newly instantiated TCP socket does not have the
+// broadcast socket option enabled.
+TEST_P(IPv4TCPUnboundExternalNetworkingSocketTest, TCPBroadcastDefault) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(socket->get(), SOL_SOCKET, SO_BROADCAST, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get, kSockOptOff);
+ EXPECT_EQ(get_sz, sizeof(get));
+}
+
+// Verifies that a newly instantiated TCP socket returns true after enabling
+// the broadcast socket option.
+TEST_P(IPv4TCPUnboundExternalNetworkingSocketTest, SetTCPBroadcast) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ EXPECT_THAT(setsockopt(socket->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(socket->get(), SOL_SOCKET, SO_BROADCAST, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get, kSockOptOn);
+ EXPECT_EQ(get_sz, sizeof(get));
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h
new file mode 100644
index 000000000..fb582b224
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h
@@ -0,0 +1,30 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_TCP_UNBOUND_EXTERNAL_NETWORKING_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_TCP_UNBOUND_EXTERNAL_NETWORKING_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to unbound IPv4 TCP sockets in a sandbox
+// with external networking support.
+using IPv4TCPUnboundExternalNetworkingSocketTest = SimpleSocketTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_TCP_UNBOUND_EXTERNAL_NETWORKING_H_
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
new file mode 100644
index 000000000..797c4174e
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
@@ -0,0 +1,39 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking.h"
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketKind> GetSockets() {
+ return ApplyVec<SocketKind>(
+ IPv4TCPUnboundSocket,
+ AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK}));
+}
+
+INSTANTIATE_TEST_SUITE_P(IPv4TCPUnboundSockets,
+ IPv4TCPUnboundExternalNetworkingSocketTest,
+ ::testing::ValuesIn(GetSockets()));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.cc b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
new file mode 100644
index 000000000..de0f5f01b
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.cc
@@ -0,0 +1,2456 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_ipv4_udp_unbound.h"
+
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <cstdio>
+
+#include "gtest/gtest.h"
+#include "absl/memory/memory.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Check that packets are not received without a group membership. Default send
+// interface configured by bind.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNoGroup) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind the first FD to the loopback. This is an alternative to
+ // IP_MULTICAST_IF for setting the default send interface.
+ auto sender_addr = V4Loopback();
+ EXPECT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ sender_addr.addr_len),
+ SyscallSucceeds());
+
+ // Bind the second FD to the v4 any address. If multicast worked like unicast,
+ // this would ensure that we get the packet.
+ auto receiver_addr = V4Any();
+ EXPECT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Send the multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we did not receive the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ EXPECT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Check that not setting a default send interface prevents multicast packets
+// from being sent. Group membership interface configured by address.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackAddrNoDefaultSendIf) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive any
+ // unicast packet.
+ auto receiver_addr = V4Any();
+ EXPECT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallFailsWithErrno(ENETUNREACH));
+}
+
+// Check that not setting a default send interface prevents multicast packets
+// from being sent. Group membership interface configured by NIC ID.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNicNoDefaultSendIf) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive any
+ // unicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallFailsWithErrno(ENETUNREACH));
+}
+
+// Check that multicast works when the default send interface is configured by
+// bind and the group membership is configured by address.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackAddr) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind the first FD to the loopback. This is an alternative to
+ // IP_MULTICAST_IF for setting the default send interface.
+ auto sender_addr = V4Loopback();
+ ASSERT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ sender_addr.addr_len),
+ SyscallSucceeds());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that multicast works when the default send interface is configured by
+// bind and the group membership is configured by NIC ID.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackNic) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind the first FD to the loopback. This is an alternative to
+ // IP_MULTICAST_IF for setting the default send interface.
+ auto sender_addr = V4Loopback();
+ ASSERT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ sender_addr.addr_len),
+ SyscallSucceeds());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that multicast works when the default send interface is configured by
+// IP_MULTICAST_IF, the send address is specified in sendto, and the group
+// membership is configured by address.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddr) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Set the default send interface.
+ ip_mreq iface = {};
+ iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that multicast works when the default send interface is configured by
+// IP_MULTICAST_IF, the send address is specified in sendto, and the group
+// membership is configured by NIC ID.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNic) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Set the default send interface.
+ ip_mreqn iface = {};
+ iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that multicast works when the default send interface is configured by
+// IP_MULTICAST_IF, the send address is specified in connect, and the group
+// membership is configured by address.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrConnect) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Set the default send interface.
+ ip_mreq iface = {};
+ iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto connect_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&connect_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ ASSERT_THAT(
+ RetryEINTR(connect)(socket1->get(),
+ reinterpret_cast<sockaddr*>(&connect_addr.addr),
+ connect_addr.addr_len),
+ SyscallSucceeds());
+
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(send)(socket1->get(), send_buf, sizeof(send_buf), 0),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that multicast works when the default send interface is configured by
+// IP_MULTICAST_IF, the send address is specified in connect, and the group
+// membership is configured by NIC ID.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicConnect) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Set the default send interface.
+ ip_mreqn iface = {};
+ iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto connect_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&connect_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ ASSERT_THAT(
+ RetryEINTR(connect)(socket1->get(),
+ reinterpret_cast<sockaddr*>(&connect_addr.addr),
+ connect_addr.addr_len),
+ SyscallSucceeds());
+
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(send)(socket1->get(), send_buf, sizeof(send_buf), 0),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that multicast works when the default send interface is configured by
+// IP_MULTICAST_IF, the send address is specified in sendto, and the group
+// membership is configured by address.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrSelf) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Set the default send interface.
+ ip_mreq iface = {};
+ iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+
+ // Bind the first FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that multicast works when the default send interface is configured by
+// IP_MULTICAST_IF, the send address is specified in sendto, and the group
+// membership is configured by NIC ID.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelf) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Set the default send interface.
+ ip_mreqn iface = {};
+ iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+
+ // Bind the first FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that multicast works when the default send interface is configured by
+// IP_MULTICAST_IF, the send address is specified in connect, and the group
+// membership is configured by address.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrSelfConnect) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Set the default send interface.
+ ip_mreq iface = {};
+ iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+
+ // Bind the first FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto connect_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&connect_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ EXPECT_THAT(
+ RetryEINTR(connect)(socket1->get(),
+ reinterpret_cast<sockaddr*>(&connect_addr.addr),
+ connect_addr.addr_len),
+ SyscallSucceeds());
+
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(send)(socket1->get(), send_buf, sizeof(send_buf), 0),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we did not receive the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ EXPECT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Check that multicast works when the default send interface is configured by
+// IP_MULTICAST_IF, the send address is specified in connect, and the group
+// membership is configured by NIC ID.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfConnect) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Set the default send interface.
+ ip_mreqn iface = {};
+ iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+
+ // Bind the first FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto connect_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&connect_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ ASSERT_THAT(
+ RetryEINTR(connect)(socket1->get(),
+ reinterpret_cast<sockaddr*>(&connect_addr.addr),
+ connect_addr.addr_len),
+ SyscallSucceeds());
+
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(send)(socket1->get(), send_buf, sizeof(send_buf), 0),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we did not receive the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ EXPECT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Check that multicast works when the default send interface is configured by
+// IP_MULTICAST_IF, the send address is specified in sendto, and the group
+// membership is configured by address.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfAddrSelfNoLoop) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Set the default send interface.
+ ip_mreq iface = {};
+ iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ // Bind the first FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that multicast works when the default send interface is configured by
+// IP_MULTICAST_IF, the send address is specified in sendto, and the group
+// membership is configured by NIC ID.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastLoopbackIfNicSelfNoLoop) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Set the default send interface.
+ ip_mreqn iface = {};
+ iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket1->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that dropping a group membership that does not exist fails.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastInvalidDrop) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Unregister from a membership that we didn't have.
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallFailsWithErrno(EADDRNOTAVAIL));
+}
+
+// Check that dropping a group membership prevents multicast packets from being
+// delivered. Default send address configured by bind and group membership
+// interface configured by address.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastDropAddr) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind the first FD to the loopback. This is an alternative to
+ // IP_MULTICAST_IF for setting the default send interface.
+ auto sender_addr = V4Loopback();
+ EXPECT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ sender_addr.addr_len),
+ SyscallSucceeds());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ EXPECT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register and unregister to receive multicast packets.
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+ EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we did not receive the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ EXPECT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Check that dropping a group membership prevents multicast packets from being
+// delivered. Default send address configured by bind and group membership
+// interface configured by NIC ID.
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastDropNic) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind the first FD to the loopback. This is an alternative to
+ // IP_MULTICAST_IF for setting the default send interface.
+ auto sender_addr = V4Loopback();
+ EXPECT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ sender_addr.addr_len),
+ SyscallSucceeds());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ EXPECT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register and unregister to receive multicast packets.
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+ EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ EXPECT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we did not receive the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ EXPECT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfZero) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ip_mreqn iface = {};
+ EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfInvalidNic) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ip_mreqn iface = {};
+ iface.imr_ifindex = -1;
+ EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallFailsWithErrno(EADDRNOTAVAIL));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfInvalidAddr) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ip_mreq iface = {};
+ iface.imr_interface.s_addr = inet_addr("255.255.255");
+ EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallFailsWithErrno(EADDRNOTAVAIL));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetShort) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Create a valid full-sized request.
+ ip_mreqn iface = {};
+ iface.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+
+ // Send an optlen of 1 to check that optlen is enforced.
+ EXPECT_THAT(
+ setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface, 1),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfDefault) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ in_addr get = {};
+ socklen_t size = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+ SyscallSucceeds());
+ EXPECT_EQ(size, sizeof(get));
+ EXPECT_EQ(get.s_addr, 0);
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfDefaultReqn) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ip_mreqn get = {};
+ socklen_t size = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+ SyscallSucceeds());
+
+ // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the
+ // first sizeof(struct in_addr) bytes of struct ip_mreqn as a struct in_addr.
+ // Conveniently, this corresponds to the field ip_mreqn::imr_multiaddr.
+ EXPECT_EQ(size, sizeof(in_addr));
+
+ // getsockopt(IP_MULTICAST_IF) will only return the interface address which
+ // hasn't been set.
+ EXPECT_EQ(get.imr_multiaddr.s_addr, 0);
+ EXPECT_EQ(get.imr_address.s_addr, 0);
+ EXPECT_EQ(get.imr_ifindex, 0);
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetAddrGetReqn) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ in_addr set = {};
+ set.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+ sizeof(set)),
+ SyscallSucceeds());
+
+ ip_mreqn get = {};
+ socklen_t size = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+ SyscallSucceeds());
+
+ // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the
+ // first sizeof(struct in_addr) bytes of struct ip_mreqn as a struct in_addr.
+ // Conveniently, this corresponds to the field ip_mreqn::imr_multiaddr.
+ EXPECT_EQ(size, sizeof(in_addr));
+ EXPECT_EQ(get.imr_multiaddr.s_addr, set.s_addr);
+ EXPECT_EQ(get.imr_address.s_addr, 0);
+ EXPECT_EQ(get.imr_ifindex, 0);
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetReqAddrGetReqn) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ip_mreq set = {};
+ set.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+ sizeof(set)),
+ SyscallSucceeds());
+
+ ip_mreqn get = {};
+ socklen_t size = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+ SyscallSucceeds());
+
+ // getsockopt(IP_MULTICAST_IF) can only return an in_addr, so it treats the
+ // first sizeof(struct in_addr) bytes of struct ip_mreqn as a struct in_addr.
+ // Conveniently, this corresponds to the field ip_mreqn::imr_multiaddr.
+ EXPECT_EQ(size, sizeof(in_addr));
+ EXPECT_EQ(get.imr_multiaddr.s_addr, set.imr_interface.s_addr);
+ EXPECT_EQ(get.imr_address.s_addr, 0);
+ EXPECT_EQ(get.imr_ifindex, 0);
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetNicGetReqn) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ip_mreqn set = {};
+ set.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+ sizeof(set)),
+ SyscallSucceeds());
+
+ ip_mreqn get = {};
+ socklen_t size = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+ SyscallSucceeds());
+ EXPECT_EQ(size, sizeof(in_addr));
+ EXPECT_EQ(get.imr_multiaddr.s_addr, 0);
+ EXPECT_EQ(get.imr_address.s_addr, 0);
+ EXPECT_EQ(get.imr_ifindex, 0);
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetAddr) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ in_addr set = {};
+ set.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+ sizeof(set)),
+ SyscallSucceeds());
+
+ in_addr get = {};
+ socklen_t size = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+ SyscallSucceeds());
+
+ EXPECT_EQ(size, sizeof(get));
+ EXPECT_EQ(get.s_addr, set.s_addr);
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetReqAddr) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ip_mreq set = {};
+ set.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+ sizeof(set)),
+ SyscallSucceeds());
+
+ in_addr get = {};
+ socklen_t size = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+ SyscallSucceeds());
+
+ EXPECT_EQ(size, sizeof(get));
+ EXPECT_EQ(get.s_addr, set.imr_interface.s_addr);
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, IpMulticastIfSetNic) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ip_mreqn set = {};
+ set.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &set,
+ sizeof(set)),
+ SyscallSucceeds());
+
+ in_addr get = {};
+ socklen_t size = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &get, &size),
+ SyscallSucceeds());
+ EXPECT_EQ(size, sizeof(get));
+ EXPECT_EQ(get.s_addr, 0);
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, TestJoinGroupNoIf) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallFailsWithErrno(ENODEV));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, TestJoinGroupInvalidIf) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ip_mreqn group = {};
+ group.imr_address.s_addr = inet_addr("255.255.255");
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallFailsWithErrno(ENODEV));
+}
+
+// Check that multiple memberships are not allowed on the same socket.
+TEST_P(IPv4UDPUnboundSocketTest, TestMultipleJoinsOnSingleSocket) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto fd = socket1->get();
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+
+ EXPECT_THAT(
+ setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)),
+ SyscallSucceeds());
+
+ EXPECT_THAT(
+ setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)),
+ SyscallFailsWithErrno(EADDRINUSE));
+}
+
+// Check that two sockets can join the same multicast group at the same time.
+TEST_P(IPv4UDPUnboundSocketTest, TestTwoSocketsJoinSameMulticastGroup) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+ EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Drop the membership twice on each socket, the second call for each socket
+ // should fail.
+ EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+ EXPECT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallFailsWithErrno(EADDRNOTAVAIL));
+ EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+ EXPECT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_DROP_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallFailsWithErrno(EADDRNOTAVAIL));
+}
+
+// Check that two sockets can join the same multicast group at the same time,
+// and both will receive data on it.
+TEST_P(IPv4UDPUnboundSocketTest, TestMcastReceptionOnTwoSockets) {
+ std::unique_ptr<SocketPair> socket_pairs[2] = {
+ absl::make_unique<FDSocketPair>(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket())),
+ absl::make_unique<FDSocketPair>(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket()))};
+
+ ip_mreq iface = {}, group = {};
+ iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ auto receiver_addr = V4Any();
+ int bound_port = 0;
+
+ // Create two socketpairs with the exact same configuration.
+ for (auto& sockets : socket_pairs) {
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
+ &iface, sizeof(iface)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(sockets->second_fd(), SOL_SOCKET, SO_REUSEPORT,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
+ &group, sizeof(group)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(sockets->second_fd(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ // Get the port assigned.
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(sockets->second_fd(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+ // On the first iteration, save the port we are bound to. On the second
+ // iteration, verify the port is the same as the one from the first
+ // iteration. In other words, both sockets listen on the same port.
+ if (bound_port == 0) {
+ bound_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ } else {
+ EXPECT_EQ(bound_port,
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port);
+ }
+ }
+
+ // Send a multicast packet to the group from two different sockets and verify
+ // it is received by both sockets that joined that group.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port = bound_port;
+ for (auto& sockets : socket_pairs) {
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(
+ RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet on both sockets.
+ for (auto& sockets : socket_pairs) {
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(
+ RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+ }
+ }
+}
+
+// Check that on two sockets that joined a group and listen on ANY, dropping
+// memberships one by one will continue to deliver packets to both sockets until
+// both memberships have been dropped.
+TEST_P(IPv4UDPUnboundSocketTest, TestMcastReceptionWhenDroppingMemberships) {
+ std::unique_ptr<SocketPair> socket_pairs[2] = {
+ absl::make_unique<FDSocketPair>(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket())),
+ absl::make_unique<FDSocketPair>(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket()))};
+
+ ip_mreq iface = {}, group = {};
+ iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ auto receiver_addr = V4Any();
+ int bound_port = 0;
+
+ // Create two socketpairs with the exact same configuration.
+ for (auto& sockets : socket_pairs) {
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_IP, IP_MULTICAST_IF,
+ &iface, sizeof(iface)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(sockets->second_fd(), SOL_SOCKET, SO_REUSEPORT,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(sockets->second_fd(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
+ &group, sizeof(group)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(sockets->second_fd(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ // Get the port assigned.
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(sockets->second_fd(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+ // On the first iteration, save the port we are bound to. On the second
+ // iteration, verify the port is the same as the one from the first
+ // iteration. In other words, both sockets listen on the same port.
+ if (bound_port == 0) {
+ bound_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ } else {
+ EXPECT_EQ(bound_port,
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port);
+ }
+ }
+
+ // Drop the membership of the first socket pair and verify data is still
+ // received.
+ ASSERT_THAT(setsockopt(socket_pairs[0]->second_fd(), IPPROTO_IP,
+ IP_DROP_MEMBERSHIP, &group, sizeof(group)),
+ SyscallSucceeds());
+ // Send a packet from each socket_pair.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port = bound_port;
+ for (auto& sockets : socket_pairs) {
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(
+ RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet on both sockets.
+ for (auto& sockets : socket_pairs) {
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(
+ RetryEINTR(recv)(sockets->second_fd(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+ }
+ }
+
+ // Drop the membership of the second socket pair and verify data stops being
+ // received.
+ ASSERT_THAT(setsockopt(socket_pairs[1]->second_fd(), IPPROTO_IP,
+ IP_DROP_MEMBERSHIP, &group, sizeof(group)),
+ SyscallSucceeds());
+ // Send a packet from each socket_pair.
+ for (auto& sockets : socket_pairs) {
+ char send_buf[200];
+ ASSERT_THAT(
+ RetryEINTR(sendto)(sockets->first_fd(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ char recv_buf[sizeof(send_buf)] = {};
+ for (auto& sockets : socket_pairs) {
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), recv_buf,
+ sizeof(recv_buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+ }
+ }
+}
+
+// Check that a receiving socket can bind to the multicast address before
+// joining the group and receive data once the group has been joined.
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenJoinThenReceive) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind second socket (receiver) to the multicast address.
+ auto receiver_addr = V4Multicast();
+ ASSERT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ // Update receiver_addr with the correct port number.
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_ifindex = ASSERT_NO_ERRNO_AND_VALUE(InterfaceIndex("lo"));
+ ASSERT_THAT(setsockopt(socket2->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet on the first socket out the loopback interface.
+ ip_mreq iface = {};
+ iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+ auto sendto_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+ sendto_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that a receiving socket can bind to the multicast address and won't
+// receive multicast data if it hasn't joined the group.
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenNoJoinThenNoReceive) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind second socket (receiver) to the multicast address.
+ auto receiver_addr = V4Multicast();
+ ASSERT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ // Update receiver_addr with the correct port number.
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Send a multicast packet on the first socket out the loopback interface.
+ ip_mreq iface = {};
+ iface.imr_interface.s_addr = htonl(INADDR_LOOPBACK);
+ ASSERT_THAT(setsockopt(socket1->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+ auto sendto_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+ sendto_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we don't receive the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Check that a socket can bind to a multicast address and still send out
+// packets.
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToMcastThenSend) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind second socket (receiver) to the ANY address.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Bind the first socket (sender) to the multicast address.
+ auto sender_addr = V4Multicast();
+ ASSERT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ sender_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t sender_addr_len = sender_addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ &sender_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(sender_addr_len, sender_addr.addr_len);
+
+ // Send a packet on the first socket to the loopback address.
+ auto sendto_addr = V4Loopback();
+ reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+ sendto_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that a receiving socket can bind to the broadcast address and receive
+// broadcast packets.
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToBcastThenReceive) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind second socket (receiver) to the broadcast address.
+ auto receiver_addr = V4Broadcast();
+ ASSERT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Send a broadcast packet on the first socket out the loopback interface.
+ EXPECT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+ // Note: Binding to the loopback interface makes the broadcast go out of it.
+ auto sender_bind_addr = V4Loopback();
+ ASSERT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_bind_addr.addr),
+ sender_bind_addr.addr_len),
+ SyscallSucceeds());
+ auto sendto_addr = V4Broadcast();
+ reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+ sendto_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that a socket can bind to the broadcast address and still send out
+// packets.
+TEST_P(IPv4UDPUnboundSocketTest, TestBindToBcastThenSend) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind second socket (receiver) to the ANY address.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(socket2->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(socket2->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Bind the first socket (sender) to the broadcast address.
+ auto sender_addr = V4Broadcast();
+ ASSERT_THAT(
+ bind(socket1->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ sender_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t sender_addr_len = sender_addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ &sender_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(sender_addr_len, sender_addr.addr_len);
+
+ // Send a packet on the first socket to the loopback address.
+ auto sendto_addr = V4Loopback();
+ reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket1->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+ sendto_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket2->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that SO_REUSEADDR always delivers to the most recently bound socket.
+//
+// FIXME(gvisor.dev/issue/873): Endpoint order is not restored correctly. Enable
+// random and co-op save (below) once that is fixed.
+TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrDistribution_NoRandomSave) {
+ std::vector<std::unique_ptr<FileDescriptor>> sockets;
+ sockets.emplace_back(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()));
+
+ ASSERT_THAT(setsockopt(sockets[0]->get(), SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Bind the first socket to the loopback and take note of the selected port.
+ auto addr = V4Loopback();
+ ASSERT_THAT(bind(sockets[0]->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+ socklen_t addr_len = addr.addr_len;
+ ASSERT_THAT(getsockname(sockets[0]->get(),
+ reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, addr.addr_len);
+
+ constexpr int kMessageSize = 200;
+
+ // FIXME(gvisor.dev/issue/873): Endpoint order is not restored correctly.
+ const DisableSave ds;
+
+ for (int i = 0; i < 10; i++) {
+ // Add a new receiver.
+ sockets.emplace_back(ASSERT_NO_ERRNO_AND_VALUE(NewSocket()));
+ auto& last = sockets.back();
+ ASSERT_THAT(setsockopt(last->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(last->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+
+ // Send a new message to the SO_REUSEADDR group. We use a new socket each
+ // time so that a new ephemeral port will be used each time. This ensures
+ // that we aren't doing REUSEPORT-like hash load blancing.
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ char send_buf[kMessageSize];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ EXPECT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Verify that the most recent socket got the message. We don't expect any
+ // of the other sockets to have received it, but we will check that later.
+ char recv_buf[sizeof(send_buf)] = {};
+ EXPECT_THAT(
+ RetryEINTR(recv)(last->get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+ }
+
+ // Verify that no other messages were received.
+ for (auto& socket : sockets) {
+ char recv_buf[kMessageSize] = {};
+ EXPECT_THAT(RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+ }
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrThenReusePort) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind socket1 with REUSEADDR.
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Bind the first socket to the loopback and take note of the selected port.
+ auto addr = V4Loopback();
+ ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+ socklen_t addr_len = addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, addr.addr_len);
+
+ // Bind socket2 to the same address as socket1, only with REUSEPORT.
+ ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReusePortThenReuseAddr) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind socket1 with REUSEPORT.
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Bind the first socket to the loopback and take note of the selected port.
+ auto addr = V4Loopback();
+ ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+ socklen_t addr_len = addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, addr.addr_len);
+
+ // Bind socket2 to the same address as socket1, only with REUSEADDR.
+ ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReusePort) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind socket1 with REUSEADDR and REUSEPORT.
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Bind the first socket to the loopback and take note of the selected port.
+ auto addr = V4Loopback();
+ ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+ socklen_t addr_len = addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, addr.addr_len);
+
+ // Bind socket2 to the same address as socket1, only with REUSEPORT.
+ ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+
+ // Bind socket3 to the same address as socket1, only with REUSEADDR.
+ ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConvertibleToReuseAddr) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind socket1 with REUSEADDR and REUSEPORT.
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Bind the first socket to the loopback and take note of the selected port.
+ auto addr = V4Loopback();
+ ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+ socklen_t addr_len = addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, addr.addr_len);
+
+ // Bind socket2 to the same address as socket1, only with REUSEADDR.
+ ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+
+ // Bind socket3 to the same address as socket1, only with REUSEPORT.
+ ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConversionReversable1) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind socket1 with REUSEADDR and REUSEPORT.
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Bind the first socket to the loopback and take note of the selected port.
+ auto addr = V4Loopback();
+ ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+ socklen_t addr_len = addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, addr.addr_len);
+
+ // Bind socket2 to the same address as socket1, only with REUSEPORT.
+ ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+
+ // Close socket2 to revert to just socket1 with REUSEADDR and REUSEPORT.
+ socket2->reset();
+
+ // Bind socket3 to the same address as socket1, only with REUSEADDR.
+ ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindReuseAddrReusePortConversionReversable2) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind socket1 with REUSEADDR and REUSEPORT.
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Bind the first socket to the loopback and take note of the selected port.
+ auto addr = V4Loopback();
+ ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+ socklen_t addr_len = addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, addr.addr_len);
+
+ // Bind socket2 to the same address as socket1, only with REUSEADDR.
+ ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+
+ // Close socket2 to revert to just socket1 with REUSEADDR and REUSEPORT.
+ socket2->reset();
+
+ // Bind socket3 to the same address as socket1, only with REUSEPORT.
+ ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindDoubleReuseAddrReusePortThenReusePort) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind socket1 with REUSEADDR and REUSEPORT.
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Bind the first socket to the loopback and take note of the selected port.
+ auto addr = V4Loopback();
+ ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+ socklen_t addr_len = addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, addr.addr_len);
+
+ // Bind socket2 to the same address as socket1, also with REUSEADDR and
+ // REUSEPORT.
+ ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+
+ // Bind socket3 to the same address as socket1, only with REUSEPORT.
+ ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+}
+
+TEST_P(IPv4UDPUnboundSocketTest, BindDoubleReuseAddrReusePortThenReuseAddr) {
+ auto socket1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto socket3 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind socket1 with REUSEADDR and REUSEPORT.
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(socket1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Bind the first socket to the loopback and take note of the selected port.
+ auto addr = V4Loopback();
+ ASSERT_THAT(bind(socket1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+ socklen_t addr_len = addr.addr_len;
+ ASSERT_THAT(getsockname(socket1->get(),
+ reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, addr.addr_len);
+
+ // Bind socket2 to the same address as socket1, also with REUSEADDR and
+ // REUSEPORT.
+ ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(socket2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(bind(socket2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+
+ // Bind socket3 to the same address as socket1, only with REUSEADDR.
+ ASSERT_THAT(setsockopt(socket3->get(), SOL_SOCKET, SO_REUSEADDR, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(socket3->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+}
+
+// Check that REUSEPORT takes precedence over REUSEADDR.
+TEST_P(IPv4UDPUnboundSocketTest, ReuseAddrReusePortDistribution) {
+ auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto receiver2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ ASSERT_THAT(setsockopt(receiver1->get(), SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(receiver1->get(), SOL_SOCKET, SO_REUSEPORT,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Bind the first socket to the loopback and take note of the selected port.
+ auto addr = V4Loopback();
+ ASSERT_THAT(bind(receiver1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+ socklen_t addr_len = addr.addr_len;
+ ASSERT_THAT(getsockname(receiver1->get(),
+ reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, addr.addr_len);
+
+ // Bind receiver2 to the same address as socket1, also with REUSEADDR and
+ // REUSEPORT.
+ ASSERT_THAT(setsockopt(receiver2->get(), SOL_SOCKET, SO_REUSEADDR,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(setsockopt(receiver2->get(), SOL_SOCKET, SO_REUSEPORT,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(bind(receiver2->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+
+ constexpr int kMessageSize = 10;
+
+ for (int i = 0; i < 100; ++i) {
+ // Send a new message to the REUSEADDR/REUSEPORT group. We use a new socket
+ // each time so that a new ephemerial port will be used each time. This
+ // ensures that we cycle through hashes.
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ char send_buf[kMessageSize] = {};
+ EXPECT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+ }
+
+ // Check that both receivers got messages. This checks that we are using load
+ // balancing (REUSEPORT) instead of the most recently bound socket
+ // (REUSEADDR).
+ char recv_buf[kMessageSize] = {};
+ EXPECT_THAT(RetryEINTR(recv)(receiver1->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallSucceedsWithValue(kMessageSize));
+ EXPECT_THAT(RetryEINTR(recv)(receiver2->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallSucceedsWithValue(kMessageSize));
+}
+
+// Check that connect returns EADDRNOTAVAIL when out of local ephemeral ports.
+// We disable S/R because this test creates a large number of sockets.
+TEST_P(IPv4UDPUnboundSocketTest, UDPConnectPortExhaustion_NoRandomSave) {
+ auto receiver1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ constexpr int kClients = 65536;
+ // Bind the first socket to the loopback and take note of the selected port.
+ auto addr = V4Loopback();
+ ASSERT_THAT(bind(receiver1->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len),
+ SyscallSucceeds());
+ socklen_t addr_len = addr.addr_len;
+ ASSERT_THAT(getsockname(receiver1->get(),
+ reinterpret_cast<sockaddr*>(&addr.addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, addr.addr_len);
+
+ // Disable cooperative S/R as we are making too many syscalls.
+ DisableSave ds;
+ std::vector<std::unique_ptr<FileDescriptor>> sockets;
+ for (int i = 0; i < kClients; i++) {
+ auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int ret = connect(s->get(), reinterpret_cast<sockaddr*>(&addr.addr),
+ addr.addr_len);
+ if (ret == 0) {
+ sockets.push_back(std::move(s));
+ continue;
+ }
+ ASSERT_THAT(ret, SyscallFailsWithErrno(EAGAIN));
+ break;
+ }
+}
+
+// Test that socket will receive packet info control message.
+TEST_P(IPv4UDPUnboundSocketTest, SetAndReceiveIPPKTINFO) {
+ // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+ SKIP_IF((IsRunningWithHostinet()));
+
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto sender_addr = V4Loopback();
+ int level = SOL_IP;
+ int type = IP_PKTINFO;
+
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ sender_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t sender_addr_len = sender_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ &sender_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(sender_addr_len, sender_addr.addr_len);
+
+ auto receiver_addr = V4Loopback();
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&sender_addr.addr)->sin_port;
+ ASSERT_THAT(
+ connect(sender->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+
+ // Allow socket to receive control message.
+ ASSERT_THAT(
+ setsockopt(receiver->get(), level, type, &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Prepare message to send.
+ constexpr size_t kDataLength = 1024;
+ msghdr sent_msg = {};
+ iovec sent_iov = {};
+ char sent_data[kDataLength];
+ sent_iov.iov_base = sent_data;
+ sent_iov.iov_len = kDataLength;
+ sent_msg.msg_iov = &sent_iov;
+ sent_msg.msg_iovlen = 1;
+ sent_msg.msg_flags = 0;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(sender->get(), &sent_msg, 0),
+ SyscallSucceedsWithValue(kDataLength));
+
+ msghdr received_msg = {};
+ iovec received_iov = {};
+ char received_data[kDataLength];
+ char received_cmsg_buf[CMSG_SPACE(sizeof(in_pktinfo))] = {};
+ size_t cmsg_data_len = sizeof(in_pktinfo);
+ received_iov.iov_base = received_data;
+ received_iov.iov_len = kDataLength;
+ received_msg.msg_iov = &received_iov;
+ received_msg.msg_iovlen = 1;
+ received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+ received_msg.msg_control = received_cmsg_buf;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(receiver->get(), &received_msg, 0),
+ SyscallSucceedsWithValue(kDataLength));
+
+ cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+ EXPECT_EQ(cmsg->cmsg_level, level);
+ EXPECT_EQ(cmsg->cmsg_type, type);
+
+ // Get loopback index.
+ ifreq ifr = {};
+ absl::SNPrintF(ifr.ifr_name, IFNAMSIZ, "lo");
+ ASSERT_THAT(ioctl(sender->get(), SIOCGIFINDEX, &ifr), SyscallSucceeds());
+ ASSERT_NE(ifr.ifr_ifindex, 0);
+
+ // Check the data
+ in_pktinfo received_pktinfo = {};
+ memcpy(&received_pktinfo, CMSG_DATA(cmsg), sizeof(in_pktinfo));
+ EXPECT_EQ(received_pktinfo.ipi_ifindex, ifr.ifr_ifindex);
+ EXPECT_EQ(received_pktinfo.ipi_spec_dst.s_addr, htonl(INADDR_LOOPBACK));
+ EXPECT_EQ(received_pktinfo.ipi_addr.s_addr, htonl(INADDR_LOOPBACK));
+}
+
+// Check that setting SO_RCVBUF below min is clamped to the minimum
+// receive buffer size.
+TEST_P(IPv4UDPUnboundSocketTest, SetSocketRecvBufBelowMin) {
+ auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Discover minimum buffer size by setting it to zero.
+ constexpr int kRcvBufSz = 0;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &kRcvBufSz,
+ sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ int min = 0;
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+ SyscallSucceeds());
+
+ // Linux doubles the value so let's use a value that when doubled will still
+ // be smaller than min.
+ int below_min = min / 2 - 1;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &below_min,
+ sizeof(below_min)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &val, &val_len),
+ SyscallSucceeds());
+
+ ASSERT_EQ(min, val);
+}
+
+// Check that setting SO_RCVBUF above max is clamped to the maximum
+// receive buffer size.
+TEST_P(IPv4UDPUnboundSocketTest, SetSocketRecvBufAboveMax) {
+ auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Discover maxmimum buffer size by setting to a really large value.
+ constexpr int kRcvBufSz = 0xffffffff;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &kRcvBufSz,
+ sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ int max = 0;
+ socklen_t max_len = sizeof(max);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &max, &max_len),
+ SyscallSucceeds());
+
+ int above_max = max + 1;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &above_max,
+ sizeof(above_max)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &val, &val_len),
+ SyscallSucceeds());
+ ASSERT_EQ(max, val);
+}
+
+// Check that setting SO_RCVBUF min <= rcvBufSz <= max is honored.
+TEST_P(IPv4UDPUnboundSocketTest, SetSocketRecvBuf) {
+ auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int max = 0;
+ int min = 0;
+ {
+ // Discover maxmimum buffer size by setting to a really large value.
+ constexpr int kRcvBufSz = 0xffffffff;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &kRcvBufSz,
+ sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ max = 0;
+ socklen_t max_len = sizeof(max);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &max, &max_len),
+ SyscallSucceeds());
+ }
+
+ {
+ // Discover minimum buffer size by setting it to zero.
+ constexpr int kRcvBufSz = 0;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &kRcvBufSz,
+ sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+ SyscallSucceeds());
+ }
+
+ int quarter_sz = min + (max - min) / 4;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &quarter_sz,
+ sizeof(quarter_sz)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_RCVBUF, &val, &val_len),
+ SyscallSucceeds());
+
+ // Linux doubles the value set by SO_SNDBUF/SO_RCVBUF.
+ if (!IsRunningOnGvisor()) {
+ quarter_sz *= 2;
+ }
+ ASSERT_EQ(quarter_sz, val);
+}
+
+// Check that setting SO_SNDBUF below min is clamped to the minimum
+// send buffer size.
+TEST_P(IPv4UDPUnboundSocketTest, SetSocketSendBufBelowMin) {
+ auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Discover minimum buffer size by setting it to zero.
+ constexpr int kSndBufSz = 0;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &kSndBufSz,
+ sizeof(kSndBufSz)),
+ SyscallSucceeds());
+
+ int min = 0;
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &min, &min_len),
+ SyscallSucceeds());
+
+ // Linux doubles the value so let's use a value that when doubled will still
+ // be smaller than min.
+ int below_min = min / 2 - 1;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &below_min,
+ sizeof(below_min)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &val, &val_len),
+ SyscallSucceeds());
+
+ ASSERT_EQ(min, val);
+}
+
+// Check that setting SO_SNDBUF above max is clamped to the maximum
+// send buffer size.
+TEST_P(IPv4UDPUnboundSocketTest, SetSocketSendBufAboveMax) {
+ auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Discover maxmimum buffer size by setting to a really large value.
+ constexpr int kSndBufSz = 0xffffffff;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &kSndBufSz,
+ sizeof(kSndBufSz)),
+ SyscallSucceeds());
+
+ int max = 0;
+ socklen_t max_len = sizeof(max);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &max, &max_len),
+ SyscallSucceeds());
+
+ int above_max = max + 1;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &above_max,
+ sizeof(above_max)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &val, &val_len),
+ SyscallSucceeds());
+ ASSERT_EQ(max, val);
+}
+
+// Check that setting SO_SNDBUF min <= kSndBufSz <= max is honored.
+TEST_P(IPv4UDPUnboundSocketTest, SetSocketSendBuf) {
+ auto s = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int max = 0;
+ int min = 0;
+ {
+ // Discover maxmimum buffer size by setting to a really large value.
+ constexpr int kSndBufSz = 0xffffffff;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &kSndBufSz,
+ sizeof(kSndBufSz)),
+ SyscallSucceeds());
+
+ max = 0;
+ socklen_t max_len = sizeof(max);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &max, &max_len),
+ SyscallSucceeds());
+ }
+
+ {
+ // Discover minimum buffer size by setting it to zero.
+ constexpr int kSndBufSz = 0;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &kSndBufSz,
+ sizeof(kSndBufSz)),
+ SyscallSucceeds());
+
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &min, &min_len),
+ SyscallSucceeds());
+ }
+
+ int quarter_sz = min + (max - min) / 4;
+ ASSERT_THAT(setsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &quarter_sz,
+ sizeof(quarter_sz)),
+ SyscallSucceeds());
+
+ int val = 0;
+ socklen_t val_len = sizeof(val);
+ ASSERT_THAT(getsockopt(s->get(), SOL_SOCKET, SO_SNDBUF, &val, &val_len),
+ SyscallSucceeds());
+
+ // Linux doubles the value set by SO_SNDBUF/SO_RCVBUF.
+ if (!IsRunningOnGvisor()) {
+ quarter_sz *= 2;
+ }
+
+ ASSERT_EQ(quarter_sz, val);
+}
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound.h b/test/syscalls/linux/socket_ipv4_udp_unbound.h
new file mode 100644
index 000000000..f64c57645
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound.h
@@ -0,0 +1,29 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to IPv4 UDP sockets.
+using IPv4UDPUnboundSocketTest = SimpleSocketTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
new file mode 100644
index 000000000..d690d9564
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
@@ -0,0 +1,1099 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h"
+
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TestAddress V4EmptyAddress() {
+ TestAddress t("V4Empty");
+ t.addr.ss_family = AF_INET;
+ t.addr_len = sizeof(sockaddr_in);
+ return t;
+}
+
+void IPv4UDPUnboundExternalNetworkingSocketTest::SetUp() {
+ got_if_infos_ = false;
+
+ // Get interface list.
+ ASSERT_NO_ERRNO(if_helper_.Load());
+ std::vector<std::string> if_names = if_helper_.InterfaceList(AF_INET);
+ if (if_names.size() != 2) {
+ return;
+ }
+
+ // Figure out which interface is where.
+ std::string lo = if_names[0];
+ std::string eth = if_names[1];
+ if (lo != "lo") std::swap(lo, eth);
+ if (lo != "lo") return;
+
+ lo_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(lo));
+ auto lo_if_addr = if_helper_.GetAddr(AF_INET, lo);
+ if (lo_if_addr == nullptr) {
+ return;
+ }
+ lo_if_addr_ = *reinterpret_cast<const sockaddr_in*>(lo_if_addr);
+
+ eth_if_idx_ = ASSERT_NO_ERRNO_AND_VALUE(if_helper_.GetIndex(eth));
+ auto eth_if_addr = if_helper_.GetAddr(AF_INET, eth);
+ if (eth_if_addr == nullptr) {
+ return;
+ }
+ eth_if_addr_ = *reinterpret_cast<const sockaddr_in*>(eth_if_addr);
+
+ got_if_infos_ = true;
+}
+
+// Verifies that a newly instantiated UDP socket does not have the
+// broadcast socket option enabled.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, UDPBroadcastDefault) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(socket->get(), SOL_SOCKET, SO_BROADCAST, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get, kSockOptOff);
+ EXPECT_EQ(get_sz, sizeof(get));
+}
+
+// Verifies that a newly instantiated UDP socket returns true after enabling
+// the broadcast socket option.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, SetUDPBroadcast) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ EXPECT_THAT(setsockopt(socket->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+
+ int get = -1;
+ socklen_t get_sz = sizeof(get);
+ EXPECT_THAT(
+ getsockopt(socket->get(), SOL_SOCKET, SO_BROADCAST, &get, &get_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get, kSockOptOn);
+ EXPECT_EQ(get_sz, sizeof(get));
+}
+
+// Verifies that a broadcast UDP packet will arrive at all UDP sockets with
+// the destination port number.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ UDPBroadcastReceivedOnExpectedPort) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto rcvr1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto rcvr2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto norcv = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Enable SO_BROADCAST on the sending socket.
+ ASSERT_THAT(setsockopt(sender->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+
+ // Enable SO_REUSEPORT on the receiving sockets so that they may both be bound
+ // to the broadcast messages destination port.
+ ASSERT_THAT(setsockopt(rcvr1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(setsockopt(rcvr2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+
+ // Bind the first socket to the ANY address and let the system assign a port.
+ auto rcv1_addr = V4Any();
+ ASSERT_THAT(bind(rcvr1->get(), reinterpret_cast<sockaddr*>(&rcv1_addr.addr),
+ rcv1_addr.addr_len),
+ SyscallSucceedsWithValue(0));
+ // Retrieve port number from first socket so that it can be bound to the
+ // second socket.
+ socklen_t rcv_addr_sz = rcv1_addr.addr_len;
+ ASSERT_THAT(
+ getsockname(rcvr1->get(), reinterpret_cast<sockaddr*>(&rcv1_addr.addr),
+ &rcv_addr_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(rcv_addr_sz, rcv1_addr.addr_len);
+ auto port = reinterpret_cast<sockaddr_in*>(&rcv1_addr.addr)->sin_port;
+
+ // Bind the second socket to the same address:port as the first.
+ ASSERT_THAT(bind(rcvr2->get(), reinterpret_cast<sockaddr*>(&rcv1_addr.addr),
+ rcv_addr_sz),
+ SyscallSucceedsWithValue(0));
+
+ // Bind the non-receiving socket to an ephemeral port.
+ auto norecv_addr = V4Any();
+ ASSERT_THAT(bind(norcv->get(), reinterpret_cast<sockaddr*>(&norecv_addr.addr),
+ norecv_addr.addr_len),
+ SyscallSucceedsWithValue(0));
+
+ // Broadcast a test message.
+ auto dst_addr = V4Broadcast();
+ reinterpret_cast<sockaddr_in*>(&dst_addr.addr)->sin_port = port;
+ constexpr char kTestMsg[] = "hello, world";
+ EXPECT_THAT(
+ sendto(sender->get(), kTestMsg, sizeof(kTestMsg), 0,
+ reinterpret_cast<sockaddr*>(&dst_addr.addr), dst_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(kTestMsg)));
+
+ // Verify that the receiving sockets received the test message.
+ char buf[sizeof(kTestMsg)] = {};
+ EXPECT_THAT(recv(rcvr1->get(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(kTestMsg)));
+ EXPECT_EQ(0, memcmp(buf, kTestMsg, sizeof(kTestMsg)));
+ memset(buf, 0, sizeof(buf));
+ EXPECT_THAT(recv(rcvr2->get(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(kTestMsg)));
+ EXPECT_EQ(0, memcmp(buf, kTestMsg, sizeof(kTestMsg)));
+
+ // Verify that the non-receiving socket did not receive the test message.
+ memset(buf, 0, sizeof(buf));
+ EXPECT_THAT(RetryEINTR(recv)(norcv->get(), buf, sizeof(buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Verifies that a broadcast UDP packet will arrive at all UDP sockets bound to
+// the destination port number and either INADDR_ANY or INADDR_BROADCAST, but
+// not a unicast address.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ UDPBroadcastReceivedOnExpectedAddresses) {
+ // FIXME(b/137899561): Linux instance for syscall tests sometimes misses its
+ // IPv4 address on eth0.
+ SKIP_IF(!got_if_infos_);
+
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto rcvr1 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto rcvr2 = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto norcv = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Enable SO_BROADCAST on the sending socket.
+ ASSERT_THAT(setsockopt(sender->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+
+ // Enable SO_REUSEPORT on all sockets so that they may all be bound to the
+ // broadcast messages destination port.
+ ASSERT_THAT(setsockopt(rcvr1->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(setsockopt(rcvr2->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+ ASSERT_THAT(setsockopt(norcv->get(), SOL_SOCKET, SO_REUSEPORT, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+
+ // Bind the first socket the ANY address and let the system assign a port.
+ auto rcv1_addr = V4Any();
+ ASSERT_THAT(bind(rcvr1->get(), reinterpret_cast<sockaddr*>(&rcv1_addr.addr),
+ rcv1_addr.addr_len),
+ SyscallSucceedsWithValue(0));
+ // Retrieve port number from first socket so that it can be bound to the
+ // second socket.
+ socklen_t rcv_addr_sz = rcv1_addr.addr_len;
+ ASSERT_THAT(
+ getsockname(rcvr1->get(), reinterpret_cast<sockaddr*>(&rcv1_addr.addr),
+ &rcv_addr_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(rcv_addr_sz, rcv1_addr.addr_len);
+ auto port = reinterpret_cast<sockaddr_in*>(&rcv1_addr.addr)->sin_port;
+
+ // Bind the second socket to the broadcast address.
+ auto rcv2_addr = V4Broadcast();
+ reinterpret_cast<sockaddr_in*>(&rcv2_addr.addr)->sin_port = port;
+ ASSERT_THAT(bind(rcvr2->get(), reinterpret_cast<sockaddr*>(&rcv2_addr.addr),
+ rcv2_addr.addr_len),
+ SyscallSucceedsWithValue(0));
+
+ // Bind the non-receiving socket to the unicast ethernet address.
+ auto norecv_addr = rcv1_addr;
+ reinterpret_cast<sockaddr_in*>(&norecv_addr.addr)->sin_addr =
+ eth_if_addr_.sin_addr;
+ ASSERT_THAT(bind(norcv->get(), reinterpret_cast<sockaddr*>(&norecv_addr.addr),
+ norecv_addr.addr_len),
+ SyscallSucceedsWithValue(0));
+
+ // Broadcast a test message.
+ auto dst_addr = V4Broadcast();
+ reinterpret_cast<sockaddr_in*>(&dst_addr.addr)->sin_port = port;
+ constexpr char kTestMsg[] = "hello, world";
+ EXPECT_THAT(
+ sendto(sender->get(), kTestMsg, sizeof(kTestMsg), 0,
+ reinterpret_cast<sockaddr*>(&dst_addr.addr), dst_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(kTestMsg)));
+
+ // Verify that the receiving sockets received the test message.
+ char buf[sizeof(kTestMsg)] = {};
+ EXPECT_THAT(recv(rcvr1->get(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(kTestMsg)));
+ EXPECT_EQ(0, memcmp(buf, kTestMsg, sizeof(kTestMsg)));
+ memset(buf, 0, sizeof(buf));
+ EXPECT_THAT(recv(rcvr2->get(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(kTestMsg)));
+ EXPECT_EQ(0, memcmp(buf, kTestMsg, sizeof(kTestMsg)));
+
+ // Verify that the non-receiving socket did not receive the test message.
+ memset(buf, 0, sizeof(buf));
+ EXPECT_THAT(RetryEINTR(recv)(norcv->get(), buf, sizeof(buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Verifies that a UDP broadcast can be sent and then received back on the same
+// socket that is bound to the broadcast address (255.255.255.255).
+// FIXME(b/141938460): This can be combined with the next test
+// (UDPBroadcastSendRecvOnSocketBoundToAny).
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ UDPBroadcastSendRecvOnSocketBoundToBroadcast) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Enable SO_BROADCAST.
+ ASSERT_THAT(setsockopt(sender->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+
+ // Bind the sender to the broadcast address.
+ auto src_addr = V4Broadcast();
+ ASSERT_THAT(bind(sender->get(), reinterpret_cast<sockaddr*>(&src_addr.addr),
+ src_addr.addr_len),
+ SyscallSucceedsWithValue(0));
+ socklen_t src_sz = src_addr.addr_len;
+ ASSERT_THAT(getsockname(sender->get(),
+ reinterpret_cast<sockaddr*>(&src_addr.addr), &src_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(src_sz, src_addr.addr_len);
+
+ // Send the message.
+ auto dst_addr = V4Broadcast();
+ reinterpret_cast<sockaddr_in*>(&dst_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&src_addr.addr)->sin_port;
+ constexpr char kTestMsg[] = "hello, world";
+ EXPECT_THAT(
+ sendto(sender->get(), kTestMsg, sizeof(kTestMsg), 0,
+ reinterpret_cast<sockaddr*>(&dst_addr.addr), dst_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(kTestMsg)));
+
+ // Verify that the message was received.
+ char buf[sizeof(kTestMsg)] = {};
+ EXPECT_THAT(RetryEINTR(recv)(sender->get(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(kTestMsg)));
+ EXPECT_EQ(0, memcmp(buf, kTestMsg, sizeof(kTestMsg)));
+}
+
+// Verifies that a UDP broadcast can be sent and then received back on the same
+// socket that is bound to the ANY address (0.0.0.0).
+// FIXME(b/141938460): This can be combined with the previous test
+// (UDPBroadcastSendRecvOnSocketBoundToBroadcast).
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ UDPBroadcastSendRecvOnSocketBoundToAny) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Enable SO_BROADCAST.
+ ASSERT_THAT(setsockopt(sender->get(), SOL_SOCKET, SO_BROADCAST, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceedsWithValue(0));
+
+ // Bind the sender to the ANY address.
+ auto src_addr = V4Any();
+ ASSERT_THAT(bind(sender->get(), reinterpret_cast<sockaddr*>(&src_addr.addr),
+ src_addr.addr_len),
+ SyscallSucceedsWithValue(0));
+ socklen_t src_sz = src_addr.addr_len;
+ ASSERT_THAT(getsockname(sender->get(),
+ reinterpret_cast<sockaddr*>(&src_addr.addr), &src_sz),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(src_sz, src_addr.addr_len);
+
+ // Send the message.
+ auto dst_addr = V4Broadcast();
+ reinterpret_cast<sockaddr_in*>(&dst_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&src_addr.addr)->sin_port;
+ constexpr char kTestMsg[] = "hello, world";
+ EXPECT_THAT(
+ sendto(sender->get(), kTestMsg, sizeof(kTestMsg), 0,
+ reinterpret_cast<sockaddr*>(&dst_addr.addr), dst_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(kTestMsg)));
+
+ // Verify that the message was received.
+ char buf[sizeof(kTestMsg)] = {};
+ EXPECT_THAT(RetryEINTR(recv)(sender->get(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(kTestMsg)));
+ EXPECT_EQ(0, memcmp(buf, kTestMsg, sizeof(kTestMsg)));
+}
+
+// Verifies that a UDP broadcast fails to send on a socket with SO_BROADCAST
+// disabled.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendBroadcast) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Broadcast a test message without having enabled SO_BROADCAST on the sending
+ // socket.
+ auto addr = V4Broadcast();
+ reinterpret_cast<sockaddr_in*>(&addr.addr)->sin_port = htons(12345);
+ constexpr char kTestMsg[] = "hello, world";
+
+ EXPECT_THAT(sendto(sender->get(), kTestMsg, sizeof(kTestMsg), 0,
+ reinterpret_cast<sockaddr*>(&addr.addr), addr.addr_len),
+ SyscallFailsWithErrno(EACCES));
+}
+
+// Verifies that a UDP unicast on an unbound socket reaches its destination.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendUnicastOnUnbound) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto rcvr = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind the receiver and retrieve its address and port number.
+ sockaddr_in addr = {};
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(INADDR_ANY);
+ addr.sin_port = htons(0);
+ ASSERT_THAT(bind(rcvr->get(), reinterpret_cast<struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallSucceedsWithValue(0));
+ memset(&addr, 0, sizeof(addr));
+ socklen_t addr_sz = sizeof(addr);
+ ASSERT_THAT(getsockname(rcvr->get(),
+ reinterpret_cast<struct sockaddr*>(&addr), &addr_sz),
+ SyscallSucceedsWithValue(0));
+
+ // Send a test message to the receiver.
+ constexpr char kTestMsg[] = "hello, world";
+ ASSERT_THAT(sendto(sender->get(), kTestMsg, sizeof(kTestMsg), 0,
+ reinterpret_cast<struct sockaddr*>(&addr), addr_sz),
+ SyscallSucceedsWithValue(sizeof(kTestMsg)));
+ char buf[sizeof(kTestMsg)] = {};
+ ASSERT_THAT(recv(rcvr->get(), buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(kTestMsg)));
+}
+
+// Check that multicast packets won't be delivered to the sending socket with no
+// set interface or group membership.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ TestSendMulticastSelfNoGroup) {
+ // FIXME(b/125485338): A group membership is not required for external
+ // multicast on gVisor.
+ SKIP_IF(IsRunningOnGvisor());
+
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ auto bind_addr = V4Any();
+ ASSERT_THAT(bind(socket->get(), reinterpret_cast<sockaddr*>(&bind_addr.addr),
+ bind_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t bind_addr_len = bind_addr.addr_len;
+ ASSERT_THAT(
+ getsockname(socket->get(), reinterpret_cast<sockaddr*>(&bind_addr.addr),
+ &bind_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(bind_addr_len, bind_addr.addr_len);
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&bind_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we did not receive the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(
+ RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Check that multicast packets will be delivered to the sending socket without
+// setting an interface.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastSelf) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ auto bind_addr = V4Any();
+ ASSERT_THAT(bind(socket->get(), reinterpret_cast<sockaddr*>(&bind_addr.addr),
+ bind_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t bind_addr_len = bind_addr.addr_len;
+ ASSERT_THAT(
+ getsockname(socket->get(), reinterpret_cast<sockaddr*>(&bind_addr.addr),
+ &bind_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(bind_addr_len, bind_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ ASSERT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&bind_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that multicast packets won't be delivered to the sending socket with no
+// set interface and IP_MULTICAST_LOOP disabled.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ TestSendMulticastSelfLoopOff) {
+ auto socket = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ auto bind_addr = V4Any();
+ ASSERT_THAT(bind(socket->get(), reinterpret_cast<sockaddr*>(&bind_addr.addr),
+ bind_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t bind_addr_len = bind_addr.addr_len;
+ ASSERT_THAT(
+ getsockname(socket->get(), reinterpret_cast<sockaddr*>(&bind_addr.addr),
+ &bind_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(bind_addr_len, bind_addr.addr_len);
+
+ // Disable multicast looping.
+ EXPECT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ // Register to receive multicast packets.
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ EXPECT_THAT(setsockopt(socket->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&bind_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(socket->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we did not receive the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ EXPECT_THAT(
+ RetryEINTR(recv)(socket->get(), recv_buf, sizeof(recv_buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Check that multicast packets won't be delivered to another socket with no
+// set interface or group membership.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastNoGroup) {
+ // FIXME(b/125485338): A group membership is not required for external
+ // multicast on gVisor.
+ SKIP_IF(IsRunningOnGvisor());
+
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we did not receive the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Check that multicast packets will be delivered to another socket without
+// setting an interface.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticast) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Register to receive multicast packets.
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that multicast packets won't be delivered to another socket with no
+// set interface and IP_MULTICAST_LOOP disabled on the sending socket.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ TestSendMulticastSenderNoLoop) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Disable multicast looping on the sender.
+ EXPECT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ // Register to receive multicast packets.
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ EXPECT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we did not receive the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf),
+ MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Check that multicast packets will be delivered to the sending socket without
+// setting an interface and IP_MULTICAST_LOOP disabled on the receiving socket.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ TestSendMulticastReceiverNoLoop) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ // Bind the second FD to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+
+ // Disable multicast looping on the receiver.
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_MULTICAST_LOOP,
+ &kSockOptOff, sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ // Register to receive multicast packets.
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Check that we received the multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+}
+
+// Check that two sockets can join the same multicast group at the same time,
+// and both will receive data on it when bound to the ANY address.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ TestSendMulticastToTwoBoundToAny) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ std::unique_ptr<FileDescriptor> receivers[2] = {
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket())};
+
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ auto receiver_addr = V4Any();
+ int bound_port = 0;
+ for (auto& receiver : receivers) {
+ ASSERT_THAT(setsockopt(receiver->get(), SOL_SOCKET, SO_REUSEPORT,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ // Bind to ANY to receive multicast packets.
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+ EXPECT_EQ(
+ htonl(INADDR_ANY),
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_addr.s_addr);
+ // On the first iteration, save the port we are bound to. On the second
+ // iteration, verify the port is the same as the one from the first
+ // iteration. In other words, both sockets listen on the same port.
+ if (bound_port == 0) {
+ bound_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ } else {
+ EXPECT_EQ(bound_port,
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port);
+ }
+
+ // Register to receive multicast packets.
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
+ &group, sizeof(group)),
+ SyscallSucceeds());
+ }
+
+ // Send a multicast packet to the group and verify both receivers get it.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port = bound_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+ for (auto& receiver : receivers) {
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(
+ RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+ }
+}
+
+// Check that two sockets can join the same multicast group at the same time,
+// and both will receive data on it when bound to the multicast address.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ TestSendMulticastToTwoBoundToMulticastAddress) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ std::unique_ptr<FileDescriptor> receivers[2] = {
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket())};
+
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ auto receiver_addr = V4Multicast();
+ int bound_port = 0;
+ for (auto& receiver : receivers) {
+ ASSERT_THAT(setsockopt(receiver->get(), SOL_SOCKET, SO_REUSEPORT,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+ EXPECT_EQ(
+ inet_addr(kMulticastAddress),
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_addr.s_addr);
+ // On the first iteration, save the port we are bound to. On the second
+ // iteration, verify the port is the same as the one from the first
+ // iteration. In other words, both sockets listen on the same port.
+ if (bound_port == 0) {
+ bound_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ } else {
+ EXPECT_EQ(
+ inet_addr(kMulticastAddress),
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_addr.s_addr);
+ EXPECT_EQ(bound_port,
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port);
+ }
+
+ // Register to receive multicast packets.
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
+ &group, sizeof(group)),
+ SyscallSucceeds());
+ }
+
+ // Send a multicast packet to the group and verify both receivers get it.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port = bound_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+ for (auto& receiver : receivers) {
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(
+ RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+ }
+}
+
+// Check that two sockets can join the same multicast group at the same time,
+// and with one bound to the wildcard address and the other bound to the
+// multicast address, both will receive data.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ TestSendMulticastToTwoBoundToAnyAndMulticastAddress) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ std::unique_ptr<FileDescriptor> receivers[2] = {
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket())};
+
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ // The first receiver binds to the wildcard address.
+ auto receiver_addr = V4Any();
+ int bound_port = 0;
+ for (auto& receiver : receivers) {
+ ASSERT_THAT(setsockopt(receiver->get(), SOL_SOCKET, SO_REUSEPORT,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+ // On the first iteration, save the port we are bound to and change the
+ // receiver address from V4Any to V4Multicast so the second receiver binds
+ // to that. On the second iteration, verify the port is the same as the one
+ // from the first iteration but the address is different.
+ if (bound_port == 0) {
+ EXPECT_EQ(
+ htonl(INADDR_ANY),
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_addr.s_addr);
+ bound_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ receiver_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port =
+ bound_port;
+ } else {
+ EXPECT_EQ(
+ inet_addr(kMulticastAddress),
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_addr.s_addr);
+ EXPECT_EQ(bound_port,
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port);
+ }
+
+ // Register to receive multicast packets.
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
+ &group, sizeof(group)),
+ SyscallSucceeds());
+ }
+
+ // Send a multicast packet to the group and verify both receivers get it.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port = bound_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+ for (auto& receiver : receivers) {
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(
+ RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+ }
+}
+
+// Check that when receiving a looped-back multicast packet, its source address
+// is not a multicast address.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ IpMulticastLoopbackFromAddr) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+ int receiver_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Connect to the multicast address. This binds us to the outgoing interface
+ // and allows us to get its IP (to be compared against the src-IP on the
+ // receiver side).
+ auto sendto_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port = receiver_port;
+ ASSERT_THAT(RetryEINTR(connect)(
+ sender->get(), reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+ sendto_addr.addr_len),
+ SyscallSucceeds());
+ auto sender_addr = V4EmptyAddress();
+ ASSERT_THAT(
+ getsockname(sender->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ &sender_addr.addr_len),
+ SyscallSucceeds());
+ ASSERT_EQ(sizeof(struct sockaddr_in), sender_addr.addr_len);
+ sockaddr_in* sender_addr_in =
+ reinterpret_cast<sockaddr_in*>(&sender_addr.addr);
+
+ // Send a multicast packet.
+ char send_buf[4] = {};
+ ASSERT_THAT(RetryEINTR(send)(sender->get(), send_buf, sizeof(send_buf), 0),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Receive a multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ auto src_addr = V4EmptyAddress();
+ ASSERT_THAT(
+ RetryEINTR(recvfrom)(receiver->get(), recv_buf, sizeof(recv_buf), 0,
+ reinterpret_cast<sockaddr*>(&src_addr.addr),
+ &src_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ ASSERT_EQ(sizeof(struct sockaddr_in), src_addr.addr_len);
+ sockaddr_in* src_addr_in = reinterpret_cast<sockaddr_in*>(&src_addr.addr);
+
+ // Verify that the received source IP:port matches the sender one.
+ EXPECT_EQ(sender_addr_in->sin_port, src_addr_in->sin_port);
+ EXPECT_EQ(sender_addr_in->sin_addr.s_addr, src_addr_in->sin_addr.s_addr);
+}
+
+// Check that when setting the IP_MULTICAST_IF option to both an index pointing
+// to the loopback interface and an address pointing to the non-loopback
+// interface, a multicast packet sent out uses the latter as its source address.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ IpMulticastLoopbackIfNicAndAddr) {
+ // FIXME(b/137899561): Linux instance for syscall tests sometimes misses its
+ // IPv4 address on eth0.
+ SKIP_IF(!got_if_infos_);
+
+ // Create receiver, bind to ANY and join the multicast group.
+ auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+ int receiver_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ ip_mreqn group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ group.imr_ifindex = lo_if_idx_;
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Set outgoing multicast interface config, with NIC and addr pointing to
+ // different interfaces.
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ ip_mreqn iface = {};
+ iface.imr_ifindex = lo_if_idx_;
+ iface.imr_address = eth_if_addr_.sin_addr;
+ ASSERT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds());
+
+ // Send a multicast packet.
+ auto sendto_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port = receiver_port;
+ char send_buf[4] = {};
+ ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+ sendto_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Receive a multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ auto src_addr = V4EmptyAddress();
+ ASSERT_THAT(
+ RetryEINTR(recvfrom)(receiver->get(), recv_buf, sizeof(recv_buf), 0,
+ reinterpret_cast<sockaddr*>(&src_addr.addr),
+ &src_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ ASSERT_EQ(sizeof(struct sockaddr_in), src_addr.addr_len);
+ sockaddr_in* src_addr_in = reinterpret_cast<sockaddr_in*>(&src_addr.addr);
+
+ // FIXME (b/137781162): When sending a multicast packet use the proper logic
+ // to determine the packet's src-IP.
+ SKIP_IF(IsRunningOnGvisor());
+
+ // Verify the received source address.
+ EXPECT_EQ(eth_if_addr_.sin_addr.s_addr, src_addr_in->sin_addr.s_addr);
+}
+
+// Check that when we are bound to one interface we can set IP_MULTICAST_IF to
+// another interface.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ IpMulticastLoopbackBindToOneIfSetMcastIfToAnother) {
+ // FIXME(b/137899561): Linux instance for syscall tests sometimes misses its
+ // IPv4 address on eth0.
+ SKIP_IF(!got_if_infos_);
+
+ // FIXME (b/137790511): When bound to one interface it is not possible to set
+ // IP_MULTICAST_IF to a different interface.
+ SKIP_IF(IsRunningOnGvisor());
+
+ // Create sender and bind to eth interface.
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ ASSERT_THAT(bind(sender->get(), reinterpret_cast<sockaddr*>(&eth_if_addr_),
+ sizeof(eth_if_addr_)),
+ SyscallSucceeds());
+
+ // Run through all possible combinations of index and address for
+ // IP_MULTICAST_IF that selects the loopback interface.
+ struct {
+ int imr_ifindex;
+ struct in_addr imr_address;
+ } test_data[] = {
+ {lo_if_idx_, {}},
+ {0, lo_if_addr_.sin_addr},
+ {lo_if_idx_, lo_if_addr_.sin_addr},
+ {lo_if_idx_, eth_if_addr_.sin_addr},
+ };
+ for (auto t : test_data) {
+ ip_mreqn iface = {};
+ iface.imr_ifindex = t.imr_ifindex;
+ iface.imr_address = t.imr_address;
+ EXPECT_THAT(setsockopt(sender->get(), IPPROTO_IP, IP_MULTICAST_IF, &iface,
+ sizeof(iface)),
+ SyscallSucceeds())
+ << "imr_index=" << iface.imr_ifindex
+ << " imr_address=" << GetAddr4Str(&iface.imr_address);
+ }
+}
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h
new file mode 100644
index 000000000..10b90b1e0
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h
@@ -0,0 +1,46 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_EXTERNAL_NETWORKING_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_EXTERNAL_NETWORKING_H_
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to unbound IPv4 UDP sockets in a sandbox
+// with external networking support.
+class IPv4UDPUnboundExternalNetworkingSocketTest : public SimpleSocketTest {
+ protected:
+ void SetUp();
+
+ IfAddrHelper if_helper_;
+
+ // got_if_infos_ is set to false if SetUp() could not obtain all interface
+ // infos that we need.
+ bool got_if_infos_;
+
+ // Interface infos.
+ int lo_if_idx_;
+ int eth_if_idx_;
+ sockaddr_in lo_if_addr_;
+ sockaddr_in eth_if_addr_;
+};
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_EXTERNAL_NETWORKING_H_
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
new file mode 100644
index 000000000..f6e64c157
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
@@ -0,0 +1,39 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.h"
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketKind> GetSockets() {
+ return ApplyVec<SocketKind>(
+ IPv4UDPUnboundSocket,
+ AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK}));
+}
+
+INSTANTIATE_TEST_SUITE_P(IPv4UDPUnboundSockets,
+ IPv4UDPUnboundExternalNetworkingSocketTest,
+ ::testing::ValuesIn(GetSockets()));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc
new file mode 100644
index 000000000..f121c044d
--- /dev/null
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_loopback.cc
@@ -0,0 +1,32 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_ipv4_udp_unbound.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+INSTANTIATE_TEST_SUITE_P(
+ IPv4UDPSockets, IPv4UDPUnboundSocketTest,
+ ::testing::ValuesIn(ApplyVec<SocketKind>(IPv4UDPUnboundSocket,
+ AllBitwiseCombinations(List<int>{
+ 0, SOCK_NONBLOCK}))));
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_netdevice.cc b/test/syscalls/linux/socket_netdevice.cc
new file mode 100644
index 000000000..15d4b85a7
--- /dev/null
+++ b/test/syscalls/linux/socket_netdevice.cc
@@ -0,0 +1,184 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/sockios.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+
+#include "gtest/gtest.h"
+#include "absl/base/internal/endian.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+// Tests for netdevice queries.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using ::testing::AnyOf;
+using ::testing::Eq;
+
+TEST(NetdeviceTest, Loopback) {
+ FileDescriptor sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+ // Prepare the request.
+ struct ifreq ifr;
+ snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+
+ // Check for a non-zero interface index.
+ ASSERT_THAT(ioctl(sock.get(), SIOCGIFINDEX, &ifr), SyscallSucceeds());
+ EXPECT_NE(ifr.ifr_ifindex, 0);
+
+ // Check that the loopback is zero hardware address.
+ ASSERT_THAT(ioctl(sock.get(), SIOCGIFHWADDR, &ifr), SyscallSucceeds());
+ EXPECT_EQ(ifr.ifr_hwaddr.sa_data[0], 0);
+ EXPECT_EQ(ifr.ifr_hwaddr.sa_data[1], 0);
+ EXPECT_EQ(ifr.ifr_hwaddr.sa_data[2], 0);
+ EXPECT_EQ(ifr.ifr_hwaddr.sa_data[3], 0);
+ EXPECT_EQ(ifr.ifr_hwaddr.sa_data[4], 0);
+ EXPECT_EQ(ifr.ifr_hwaddr.sa_data[5], 0);
+}
+
+TEST(NetdeviceTest, Netmask) {
+ // We need an interface index to identify the loopback device.
+ FileDescriptor sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+ struct ifreq ifr;
+ snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+ ASSERT_THAT(ioctl(sock.get(), SIOCGIFINDEX, &ifr), SyscallSucceeds());
+ EXPECT_NE(ifr.ifr_ifindex, 0);
+
+ // Use a netlink socket to get the netmask, which we'll then compare to the
+ // netmask obtained via ioctl.
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+ uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct rtgenmsg rgm;
+ };
+
+ constexpr uint32_t kSeq = 12345;
+
+ struct request req;
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETADDR;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.hdr.nlmsg_seq = kSeq;
+ req.rgm.rtgen_family = AF_UNSPEC;
+
+ // Iterate through messages until we find the one containing the prefix length
+ // (i.e. netmask) for the loopback device.
+ int prefixlen = -1;
+ ASSERT_NO_ERRNO(NetlinkRequestResponse(
+ fd, &req, sizeof(req),
+ [&](const struct nlmsghdr* hdr) {
+ EXPECT_THAT(hdr->nlmsg_type, AnyOf(Eq(RTM_NEWADDR), Eq(NLMSG_DONE)));
+
+ EXPECT_TRUE((hdr->nlmsg_flags & NLM_F_MULTI) == NLM_F_MULTI)
+ << std::hex << hdr->nlmsg_flags;
+
+ EXPECT_EQ(hdr->nlmsg_seq, kSeq);
+ EXPECT_EQ(hdr->nlmsg_pid, port);
+
+ if (hdr->nlmsg_type != RTM_NEWADDR) {
+ return;
+ }
+
+ // RTM_NEWADDR contains at least the header and ifaddrmsg.
+ EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct ifaddrmsg));
+
+ struct ifaddrmsg* ifaddrmsg =
+ reinterpret_cast<struct ifaddrmsg*>(NLMSG_DATA(hdr));
+ if (ifaddrmsg->ifa_index == static_cast<uint32_t>(ifr.ifr_ifindex) &&
+ ifaddrmsg->ifa_family == AF_INET) {
+ prefixlen = ifaddrmsg->ifa_prefixlen;
+ }
+ },
+ false));
+
+ ASSERT_GE(prefixlen, 0);
+
+ // Netmask is stored big endian in struct sockaddr_in, so we do the same for
+ // comparison.
+ uint32_t mask = 0xffffffff << (32 - prefixlen);
+ mask = absl::gbswap_32(mask);
+
+ // Check that the loopback interface has the correct subnet mask.
+ snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+ ASSERT_THAT(ioctl(sock.get(), SIOCGIFNETMASK, &ifr), SyscallSucceeds());
+ EXPECT_EQ(ifr.ifr_netmask.sa_family, AF_INET);
+ struct sockaddr_in* sin =
+ reinterpret_cast<struct sockaddr_in*>(&ifr.ifr_netmask);
+ EXPECT_EQ(sin->sin_addr.s_addr, mask);
+}
+
+TEST(NetdeviceTest, InterfaceName) {
+ FileDescriptor sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+ // Prepare the request.
+ struct ifreq ifr;
+ snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+
+ // Check for a non-zero interface index.
+ ASSERT_THAT(ioctl(sock.get(), SIOCGIFINDEX, &ifr), SyscallSucceeds());
+ EXPECT_NE(ifr.ifr_ifindex, 0);
+
+ // Check that SIOCGIFNAME finds the loopback interface.
+ snprintf(ifr.ifr_name, IFNAMSIZ, "foo");
+ ASSERT_THAT(ioctl(sock.get(), SIOCGIFNAME, &ifr), SyscallSucceeds());
+ EXPECT_STREQ(ifr.ifr_name, "lo");
+}
+
+TEST(NetdeviceTest, InterfaceFlags) {
+ FileDescriptor sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+ // Prepare the request.
+ struct ifreq ifr;
+ snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+
+ // Check that SIOCGIFFLAGS marks the interface with IFF_LOOPBACK, IFF_UP, and
+ // IFF_RUNNING.
+ ASSERT_THAT(ioctl(sock.get(), SIOCGIFFLAGS, &ifr), SyscallSucceeds());
+ EXPECT_EQ(ifr.ifr_flags & IFF_UP, IFF_UP);
+ EXPECT_EQ(ifr.ifr_flags & IFF_RUNNING, IFF_RUNNING);
+}
+
+TEST(NetdeviceTest, InterfaceMTU) {
+ FileDescriptor sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0));
+
+ // Prepare the request.
+ struct ifreq ifr = {};
+ snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+
+ // Check that SIOCGIFMTU returns a nonzero MTU.
+ ASSERT_THAT(ioctl(sock.get(), SIOCGIFMTU, &ifr), SyscallSucceeds());
+ EXPECT_GT(ifr.ifr_mtu, 0);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink.cc b/test/syscalls/linux/socket_netlink.cc
new file mode 100644
index 000000000..4ec0fd4fa
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink.cc
@@ -0,0 +1,153 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/netlink.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+// Tests for all netlink socket protocols.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// NetlinkTest parameter is the protocol to test.
+using NetlinkTest = ::testing::TestWithParam<int>;
+
+// Netlink sockets must be SOCK_DGRAM or SOCK_RAW.
+TEST_P(NetlinkTest, Types) {
+ const int protocol = GetParam();
+
+ EXPECT_THAT(socket(AF_NETLINK, SOCK_STREAM, protocol),
+ SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+ EXPECT_THAT(socket(AF_NETLINK, SOCK_SEQPACKET, protocol),
+ SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+ EXPECT_THAT(socket(AF_NETLINK, SOCK_RDM, protocol),
+ SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+ EXPECT_THAT(socket(AF_NETLINK, SOCK_DCCP, protocol),
+ SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+ EXPECT_THAT(socket(AF_NETLINK, SOCK_PACKET, protocol),
+ SyscallFailsWithErrno(ESOCKTNOSUPPORT));
+
+ int fd;
+ EXPECT_THAT(fd = socket(AF_NETLINK, SOCK_DGRAM, protocol), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+
+ EXPECT_THAT(fd = socket(AF_NETLINK, SOCK_RAW, protocol), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST_P(NetlinkTest, AutomaticPort) {
+ const int protocol = GetParam();
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, protocol));
+
+ struct sockaddr_nl addr = {};
+ addr.nl_family = AF_NETLINK;
+
+ EXPECT_THAT(
+ bind(fd.get(), reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+ SyscallSucceeds());
+
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, sizeof(addr));
+ // This is the only netlink socket in the process, so it should get the PID as
+ // the port id.
+ //
+ // N.B. Another process could theoretically have explicitly reserved our pid
+ // as a port ID, but that is very unlikely.
+ EXPECT_EQ(addr.nl_pid, getpid());
+}
+
+// Calling connect automatically binds to an automatic port.
+TEST_P(NetlinkTest, ConnectBinds) {
+ const int protocol = GetParam();
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, protocol));
+
+ struct sockaddr_nl addr = {};
+ addr.nl_family = AF_NETLINK;
+
+ EXPECT_THAT(connect(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallSucceeds());
+
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, sizeof(addr));
+
+ // Each test is running in a pid namespace, so another process can explicitly
+ // reserve our pid as a port ID. In this case, a negative portid value will be
+ // set.
+ if (static_cast<pid_t>(addr.nl_pid) > 0) {
+ EXPECT_EQ(addr.nl_pid, getpid());
+ }
+
+ memset(&addr, 0, sizeof(addr));
+ addr.nl_family = AF_NETLINK;
+
+ // Connecting again is allowed, but keeps the same port.
+ EXPECT_THAT(connect(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ sizeof(addr)),
+ SyscallSucceeds());
+
+ addrlen = sizeof(addr);
+ EXPECT_THAT(getsockname(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, sizeof(addr));
+ EXPECT_EQ(addr.nl_pid, getpid());
+}
+
+TEST_P(NetlinkTest, GetPeerName) {
+ const int protocol = GetParam();
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, protocol));
+
+ struct sockaddr_nl addr = {};
+ socklen_t addrlen = sizeof(addr);
+
+ EXPECT_THAT(getpeername(fd.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ &addrlen),
+ SyscallSucceeds());
+
+ EXPECT_EQ(addrlen, sizeof(addr));
+ EXPECT_EQ(addr.nl_family, AF_NETLINK);
+ // Peer is the kernel if we didn't connect elsewhere.
+ EXPECT_EQ(addr.nl_pid, 0);
+}
+
+INSTANTIATE_TEST_SUITE_P(ProtocolTest, NetlinkTest,
+ ::testing::Values(NETLINK_ROUTE,
+ NETLINK_KOBJECT_UEVENT));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_route.cc b/test/syscalls/linux/socket_netlink_route.cc
new file mode 100644
index 000000000..e6647a1c3
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route.cc
@@ -0,0 +1,935 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <linux/if.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_format.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+// Tests for NETLINK_ROUTE sockets.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr uint32_t kSeq = 12345;
+
+using ::testing::AnyOf;
+using ::testing::Eq;
+
+// Parameters for SockOptTest. They are:
+// 0: Socket option to query.
+// 1: A predicate to run on the returned sockopt value. Should return true if
+// the value is considered ok.
+// 2: A description of what the sockopt value is expected to be. Should complete
+// the sentence "<value> was unexpected, expected <description>"
+using SockOptTest = ::testing::TestWithParam<
+ std::tuple<int, std::function<bool(int)>, std::string>>;
+
+TEST_P(SockOptTest, GetSockOpt) {
+ int sockopt = std::get<0>(GetParam());
+ auto verifier = std::get<1>(GetParam());
+ std::string verifier_description = std::get<2>(GetParam());
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE));
+
+ int res;
+ socklen_t len = sizeof(res);
+
+ EXPECT_THAT(getsockopt(fd.get(), SOL_SOCKET, sockopt, &res, &len),
+ SyscallSucceeds());
+
+ EXPECT_EQ(len, sizeof(res));
+ EXPECT_TRUE(verifier(res)) << absl::StrFormat(
+ "getsockopt(%d, SOL_SOCKET, %d, &res, &len) => res=%d was unexpected, "
+ "expected %s",
+ fd.get(), sockopt, res, verifier_description);
+}
+
+std::function<bool(int)> IsPositive() {
+ return [](int val) { return val > 0; };
+}
+
+std::function<bool(int)> IsEqual(int target) {
+ return [target](int val) { return val == target; };
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ NetlinkRouteTest, SockOptTest,
+ ::testing::Values(
+ std::make_tuple(SO_SNDBUF, IsPositive(), "positive send buffer size"),
+ std::make_tuple(SO_RCVBUF, IsPositive(),
+ "positive receive buffer size"),
+ std::make_tuple(SO_TYPE, IsEqual(SOCK_RAW),
+ absl::StrFormat("SOCK_RAW (%d)", SOCK_RAW)),
+ std::make_tuple(SO_DOMAIN, IsEqual(AF_NETLINK),
+ absl::StrFormat("AF_NETLINK (%d)", AF_NETLINK)),
+ std::make_tuple(SO_PROTOCOL, IsEqual(NETLINK_ROUTE),
+ absl::StrFormat("NETLINK_ROUTE (%d)", NETLINK_ROUTE)),
+ std::make_tuple(SO_PASSCRED, IsEqual(0), "0")));
+
+// Validates the reponses to RTM_GETLINK + NLM_F_DUMP.
+void CheckGetLinkResponse(const struct nlmsghdr* hdr, int seq, int port) {
+ EXPECT_THAT(hdr->nlmsg_type, AnyOf(Eq(RTM_NEWLINK), Eq(NLMSG_DONE)));
+
+ EXPECT_TRUE((hdr->nlmsg_flags & NLM_F_MULTI) == NLM_F_MULTI)
+ << std::hex << hdr->nlmsg_flags;
+
+ EXPECT_EQ(hdr->nlmsg_seq, seq);
+ EXPECT_EQ(hdr->nlmsg_pid, port);
+
+ if (hdr->nlmsg_type != RTM_NEWLINK) {
+ return;
+ }
+
+ // RTM_NEWLINK contains at least the header and ifinfomsg.
+ EXPECT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+
+ // TODO(mpratt): Check ifinfomsg contents and following attrs.
+}
+
+TEST(NetlinkRouteTest, GetLinkDump) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+ uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+
+ // Loopback is common among all tests, check that it's found.
+ bool loopbackFound = false;
+ ASSERT_NO_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+ CheckGetLinkResponse(hdr, kSeq, port);
+ if (hdr->nlmsg_type != RTM_NEWLINK) {
+ return;
+ }
+ ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+ const struct ifinfomsg* msg =
+ reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+ std::cout << "Found interface idx=" << msg->ifi_index
+ << ", type=" << std::hex << msg->ifi_type << std::endl;
+ if (msg->ifi_type == ARPHRD_LOOPBACK) {
+ loopbackFound = true;
+ EXPECT_NE(msg->ifi_flags & IFF_LOOPBACK, 0);
+ }
+ }));
+ EXPECT_TRUE(loopbackFound);
+}
+
+// CheckLinkMsg checks a netlink message against an expected link.
+void CheckLinkMsg(const struct nlmsghdr* hdr, const Link& link) {
+ ASSERT_THAT(hdr->nlmsg_type, Eq(RTM_NEWLINK));
+ ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct ifinfomsg)));
+ const struct ifinfomsg* msg =
+ reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+ EXPECT_EQ(msg->ifi_index, link.index);
+
+ const struct rtattr* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+ EXPECT_NE(nullptr, rta) << "IFLA_IFNAME not found in message.";
+ if (rta != nullptr) {
+ std::string name(reinterpret_cast<const char*>(RTA_DATA(rta)));
+ EXPECT_EQ(name, link.name);
+ }
+}
+
+TEST(NetlinkRouteTest, GetLinkByIndex) {
+ Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct ifinfomsg ifm;
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETLINK;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST;
+ req.hdr.nlmsg_seq = kSeq;
+ req.ifm.ifi_family = AF_UNSPEC;
+ req.ifm.ifi_index = loopback_link.index;
+
+ bool found = false;
+ ASSERT_NO_ERRNO(NetlinkRequestResponse(
+ fd, &req, sizeof(req),
+ [&](const struct nlmsghdr* hdr) {
+ CheckLinkMsg(hdr, loopback_link);
+ found = true;
+ },
+ false));
+ EXPECT_TRUE(found) << "Netlink response does not contain any links.";
+}
+
+TEST(NetlinkRouteTest, GetLinkByName) {
+ Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct ifinfomsg ifm;
+ struct rtattr rtattr;
+ char ifname[IFNAMSIZ];
+ char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_type = RTM_GETLINK;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST;
+ req.hdr.nlmsg_seq = kSeq;
+ req.ifm.ifi_family = AF_UNSPEC;
+ req.rtattr.rta_type = IFLA_IFNAME;
+ req.rtattr.rta_len = RTA_LENGTH(loopback_link.name.size() + 1);
+ strncpy(req.ifname, loopback_link.name.c_str(), sizeof(req.ifname));
+ req.hdr.nlmsg_len =
+ NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+ bool found = false;
+ ASSERT_NO_ERRNO(NetlinkRequestResponse(
+ fd, &req, sizeof(req),
+ [&](const struct nlmsghdr* hdr) {
+ CheckLinkMsg(hdr, loopback_link);
+ found = true;
+ },
+ false));
+ EXPECT_TRUE(found) << "Netlink response does not contain any links.";
+}
+
+TEST(NetlinkRouteTest, GetLinkByIndexNotFound) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct ifinfomsg ifm;
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETLINK;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST;
+ req.hdr.nlmsg_seq = kSeq;
+ req.ifm.ifi_family = AF_UNSPEC;
+ req.ifm.ifi_index = 1234590;
+
+ EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+ PosixErrorIs(ENODEV, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, GetLinkByNameNotFound) {
+ const std::string name = "nodevice?!";
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct ifinfomsg ifm;
+ struct rtattr rtattr;
+ char ifname[IFNAMSIZ];
+ char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_type = RTM_GETLINK;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST;
+ req.hdr.nlmsg_seq = kSeq;
+ req.ifm.ifi_family = AF_UNSPEC;
+ req.rtattr.rta_type = IFLA_IFNAME;
+ req.rtattr.rta_len = RTA_LENGTH(name.size() + 1);
+ strncpy(req.ifname, name.c_str(), sizeof(req.ifname));
+ req.hdr.nlmsg_len =
+ NLMSG_LENGTH(sizeof(req.ifm)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+ EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+ PosixErrorIs(ENODEV, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, MsgHdrMsgUnsuppType) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct ifinfomsg ifm;
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_len = sizeof(req);
+ // If type & 0x3 is equal to 0x2, this means a get request
+ // which doesn't require CAP_SYS_ADMIN.
+ req.hdr.nlmsg_type = ((__RTM_MAX + 1024) & (~0x3)) | 0x2;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.hdr.nlmsg_seq = kSeq;
+ req.ifm.ifi_family = AF_UNSPEC;
+
+ EXPECT_THAT(NetlinkRequestAckOrError(fd, kSeq, &req, sizeof(req)),
+ PosixErrorIs(EOPNOTSUPP, ::testing::_));
+}
+
+TEST(NetlinkRouteTest, MsgHdrMsgTrunc) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct ifinfomsg ifm;
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETLINK;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.hdr.nlmsg_seq = kSeq;
+ req.ifm.ifi_family = AF_UNSPEC;
+
+ struct iovec iov = {};
+ iov.iov_base = &req;
+ iov.iov_len = sizeof(req);
+
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ // No destination required; it defaults to pid 0, the kernel.
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+ // Small enough to ensure that the response doesn't fit.
+ constexpr size_t kBufferSize = 10;
+ std::vector<char> buf(kBufferSize);
+ iov.iov_base = buf.data();
+ iov.iov_len = buf.size();
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(fd.get(), &msg, 0),
+ SyscallSucceedsWithValue(kBufferSize));
+ EXPECT_EQ((msg.msg_flags & MSG_TRUNC), MSG_TRUNC);
+}
+
+TEST(NetlinkRouteTest, MsgTruncMsgHdrMsgTrunc) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct ifinfomsg ifm;
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETLINK;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.hdr.nlmsg_seq = kSeq;
+ req.ifm.ifi_family = AF_UNSPEC;
+
+ struct iovec iov = {};
+ iov.iov_base = &req;
+ iov.iov_len = sizeof(req);
+
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ // No destination required; it defaults to pid 0, the kernel.
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+ // Small enough to ensure that the response doesn't fit.
+ constexpr size_t kBufferSize = 10;
+ std::vector<char> buf(kBufferSize);
+ iov.iov_base = buf.data();
+ iov.iov_len = buf.size();
+
+ int res = 0;
+ ASSERT_THAT(res = RetryEINTR(recvmsg)(fd.get(), &msg, MSG_TRUNC),
+ SyscallSucceeds());
+ EXPECT_GT(res, kBufferSize);
+ EXPECT_EQ((msg.msg_flags & MSG_TRUNC), MSG_TRUNC);
+}
+
+TEST(NetlinkRouteTest, ControlMessageIgnored) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+ uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+
+ struct request {
+ struct nlmsghdr control_hdr;
+ struct nlmsghdr message_hdr;
+ struct ifinfomsg ifm;
+ };
+
+ struct request req = {};
+
+ // This control message is ignored. We still receive a response for the
+ // following RTM_GETLINK.
+ req.control_hdr.nlmsg_len = sizeof(req.control_hdr);
+ req.control_hdr.nlmsg_type = NLMSG_DONE;
+ req.control_hdr.nlmsg_seq = kSeq;
+
+ req.message_hdr.nlmsg_len = sizeof(req.message_hdr) + sizeof(req.ifm);
+ req.message_hdr.nlmsg_type = RTM_GETLINK;
+ req.message_hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.message_hdr.nlmsg_seq = kSeq;
+
+ req.ifm.ifi_family = AF_UNSPEC;
+
+ ASSERT_NO_ERRNO(NetlinkRequestResponse(
+ fd, &req, sizeof(req),
+ [&](const struct nlmsghdr* hdr) {
+ CheckGetLinkResponse(hdr, kSeq, port);
+ },
+ false));
+}
+
+TEST(NetlinkRouteTest, GetAddrDump) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+ uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct rtgenmsg rgm;
+ };
+
+ struct request req;
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETADDR;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.hdr.nlmsg_seq = kSeq;
+ req.rgm.rtgen_family = AF_UNSPEC;
+
+ ASSERT_NO_ERRNO(NetlinkRequestResponse(
+ fd, &req, sizeof(req),
+ [&](const struct nlmsghdr* hdr) {
+ EXPECT_THAT(hdr->nlmsg_type, AnyOf(Eq(RTM_NEWADDR), Eq(NLMSG_DONE)));
+
+ EXPECT_TRUE((hdr->nlmsg_flags & NLM_F_MULTI) == NLM_F_MULTI)
+ << std::hex << hdr->nlmsg_flags;
+
+ EXPECT_EQ(hdr->nlmsg_seq, kSeq);
+ EXPECT_EQ(hdr->nlmsg_pid, port);
+
+ if (hdr->nlmsg_type != RTM_NEWADDR) {
+ return;
+ }
+
+ // RTM_NEWADDR contains at least the header and ifaddrmsg.
+ EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct ifaddrmsg));
+
+ // TODO(mpratt): Check ifaddrmsg contents and following attrs.
+ },
+ false));
+}
+
+TEST(NetlinkRouteTest, LookupAll) {
+ struct ifaddrs* if_addr_list = nullptr;
+ auto cleanup = Cleanup([&if_addr_list]() { freeifaddrs(if_addr_list); });
+
+ // Not a syscall but we can use the syscall matcher as glibc sets errno.
+ ASSERT_THAT(getifaddrs(&if_addr_list), SyscallSucceeds());
+
+ int count = 0;
+ for (struct ifaddrs* i = if_addr_list; i; i = i->ifa_next) {
+ if (!i->ifa_addr || (i->ifa_addr->sa_family != AF_INET &&
+ i->ifa_addr->sa_family != AF_INET6)) {
+ continue;
+ }
+ count++;
+ }
+ ASSERT_GT(count, 0);
+}
+
+TEST(NetlinkRouteTest, AddAddr) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+ Link loopback_link = ASSERT_NO_ERRNO_AND_VALUE(LoopbackLink());
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct ifaddrmsg ifa;
+ struct rtattr rtattr;
+ struct in_addr addr;
+ char pad[NLMSG_ALIGNTO + RTA_ALIGNTO];
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_type = RTM_NEWADDR;
+ req.hdr.nlmsg_seq = kSeq;
+ req.ifa.ifa_family = AF_INET;
+ req.ifa.ifa_prefixlen = 24;
+ req.ifa.ifa_flags = 0;
+ req.ifa.ifa_scope = 0;
+ req.ifa.ifa_index = loopback_link.index;
+ req.rtattr.rta_type = IFA_LOCAL;
+ req.rtattr.rta_len = RTA_LENGTH(sizeof(req.addr));
+ inet_pton(AF_INET, "10.0.0.1", &req.addr);
+ req.hdr.nlmsg_len =
+ NLMSG_LENGTH(sizeof(req.ifa)) + NLMSG_ALIGN(req.rtattr.rta_len);
+
+ // Create should succeed, as no such address in kernel.
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;
+ EXPECT_NO_ERRNO(
+ NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+
+ // Replace an existing address should succeed.
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_REPLACE | NLM_F_ACK;
+ req.hdr.nlmsg_seq++;
+ EXPECT_NO_ERRNO(
+ NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len));
+
+ // Create exclusive should fail, as we created the address above.
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
+ req.hdr.nlmsg_seq++;
+ EXPECT_THAT(
+ NetlinkRequestAckOrError(fd, req.hdr.nlmsg_seq, &req, req.hdr.nlmsg_len),
+ PosixErrorIs(EEXIST, ::testing::_));
+}
+
+// GetRouteDump tests a RTM_GETROUTE + NLM_F_DUMP request.
+TEST(NetlinkRouteTest, GetRouteDump) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+ uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct rtmsg rtm;
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETROUTE;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.hdr.nlmsg_seq = kSeq;
+ req.rtm.rtm_family = AF_UNSPEC;
+
+ bool routeFound = false;
+ bool dstFound = true;
+ ASSERT_NO_ERRNO(NetlinkRequestResponse(
+ fd, &req, sizeof(req),
+ [&](const struct nlmsghdr* hdr) {
+ // Validate the reponse to RTM_GETROUTE + NLM_F_DUMP.
+ EXPECT_THAT(hdr->nlmsg_type, AnyOf(Eq(RTM_NEWROUTE), Eq(NLMSG_DONE)));
+
+ EXPECT_TRUE((hdr->nlmsg_flags & NLM_F_MULTI) == NLM_F_MULTI)
+ << std::hex << hdr->nlmsg_flags;
+
+ EXPECT_EQ(hdr->nlmsg_seq, kSeq);
+ EXPECT_EQ(hdr->nlmsg_pid, port);
+
+ // The test should not proceed if it's not a RTM_NEWROUTE message.
+ if (hdr->nlmsg_type != RTM_NEWROUTE) {
+ return;
+ }
+
+ // RTM_NEWROUTE contains at least the header and rtmsg.
+ ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct rtmsg)));
+ const struct rtmsg* msg =
+ reinterpret_cast<const struct rtmsg*>(NLMSG_DATA(hdr));
+ // NOTE: rtmsg fields are char fields.
+ std::cout << "Found route table=" << static_cast<int>(msg->rtm_table)
+ << ", protocol=" << static_cast<int>(msg->rtm_protocol)
+ << ", scope=" << static_cast<int>(msg->rtm_scope)
+ << ", type=" << static_cast<int>(msg->rtm_type);
+
+ int len = RTM_PAYLOAD(hdr);
+ bool rtDstFound = false;
+ for (struct rtattr* attr = RTM_RTA(msg); RTA_OK(attr, len);
+ attr = RTA_NEXT(attr, len)) {
+ if (attr->rta_type == RTA_DST) {
+ char address[INET_ADDRSTRLEN] = {};
+ inet_ntop(AF_INET, RTA_DATA(attr), address, sizeof(address));
+ std::cout << ", dst=" << address;
+ rtDstFound = true;
+ }
+ }
+
+ std::cout << std::endl;
+
+ if (msg->rtm_table == RT_TABLE_MAIN) {
+ routeFound = true;
+ dstFound = rtDstFound && dstFound;
+ }
+ },
+ false));
+ // At least one route found in main route table.
+ EXPECT_TRUE(routeFound);
+ // Found RTA_DST for each route in main table.
+ EXPECT_TRUE(dstFound);
+}
+
+// GetRouteRequest tests a RTM_GETROUTE request with RTM_F_LOOKUP_TABLE flag.
+TEST(NetlinkRouteTest, GetRouteRequest) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+ uint32_t port = ASSERT_NO_ERRNO_AND_VALUE(NetlinkPortID(fd.get()));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct rtmsg rtm;
+ struct nlattr nla;
+ struct in_addr sin_addr;
+ };
+
+ constexpr uint32_t kSeq = 12345;
+
+ struct request req = {};
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETROUTE;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST;
+ req.hdr.nlmsg_seq = kSeq;
+
+ req.rtm.rtm_family = AF_INET;
+ req.rtm.rtm_dst_len = 32;
+ req.rtm.rtm_src_len = 0;
+ req.rtm.rtm_tos = 0;
+ req.rtm.rtm_table = RT_TABLE_UNSPEC;
+ req.rtm.rtm_protocol = RTPROT_UNSPEC;
+ req.rtm.rtm_scope = RT_SCOPE_UNIVERSE;
+ req.rtm.rtm_type = RTN_UNSPEC;
+ req.rtm.rtm_flags = RTM_F_LOOKUP_TABLE;
+
+ req.nla.nla_len = 8;
+ req.nla.nla_type = RTA_DST;
+ inet_aton("127.0.0.2", &req.sin_addr);
+
+ bool rtDstFound = false;
+ ASSERT_NO_ERRNO(NetlinkRequestResponseSingle(
+ fd, &req, sizeof(req), [&](const struct nlmsghdr* hdr) {
+ // Validate the reponse to RTM_GETROUTE request with RTM_F_LOOKUP_TABLE
+ // flag.
+ EXPECT_THAT(hdr->nlmsg_type, RTM_NEWROUTE);
+
+ EXPECT_TRUE(hdr->nlmsg_flags == 0) << std::hex << hdr->nlmsg_flags;
+
+ EXPECT_EQ(hdr->nlmsg_seq, kSeq);
+ EXPECT_EQ(hdr->nlmsg_pid, port);
+
+ // RTM_NEWROUTE contains at least the header and rtmsg.
+ ASSERT_GE(hdr->nlmsg_len, NLMSG_SPACE(sizeof(struct rtmsg)));
+ const struct rtmsg* msg =
+ reinterpret_cast<const struct rtmsg*>(NLMSG_DATA(hdr));
+
+ // NOTE: rtmsg fields are char fields.
+ std::cout << "Found route table=" << static_cast<int>(msg->rtm_table)
+ << ", protocol=" << static_cast<int>(msg->rtm_protocol)
+ << ", scope=" << static_cast<int>(msg->rtm_scope)
+ << ", type=" << static_cast<int>(msg->rtm_type);
+
+ EXPECT_EQ(msg->rtm_family, AF_INET);
+ EXPECT_EQ(msg->rtm_dst_len, 32);
+ EXPECT_TRUE((msg->rtm_flags & RTM_F_CLONED) == RTM_F_CLONED)
+ << std::hex << msg->rtm_flags;
+
+ int len = RTM_PAYLOAD(hdr);
+ std::cout << ", len=" << len;
+ for (struct rtattr* attr = RTM_RTA(msg); RTA_OK(attr, len);
+ attr = RTA_NEXT(attr, len)) {
+ if (attr->rta_type == RTA_DST) {
+ char address[INET_ADDRSTRLEN] = {};
+ inet_ntop(AF_INET, RTA_DATA(attr), address, sizeof(address));
+ std::cout << ", dst=" << address;
+ rtDstFound = true;
+ } else if (attr->rta_type == RTA_OIF) {
+ const char* oif = reinterpret_cast<const char*>(RTA_DATA(attr));
+ std::cout << ", oif=" << oif;
+ }
+ }
+
+ std::cout << std::endl;
+ }));
+ // Found RTA_DST for RTM_F_LOOKUP_TABLE.
+ EXPECT_TRUE(rtDstFound);
+}
+
+// RecvmsgTrunc tests the recvmsg MSG_TRUNC flag with zero length output
+// buffer. MSG_TRUNC with a zero length buffer should consume subsequent
+// messages off the socket.
+TEST(NetlinkRouteTest, RecvmsgTrunc) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct rtgenmsg rgm;
+ };
+
+ struct request req;
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETADDR;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.hdr.nlmsg_seq = kSeq;
+ req.rgm.rtgen_family = AF_UNSPEC;
+
+ struct iovec iov = {};
+ iov.iov_base = &req;
+ iov.iov_len = sizeof(req);
+
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+ iov.iov_base = NULL;
+ iov.iov_len = 0;
+
+ int trunclen, trunclen2;
+
+ // Note: This test assumes at least two messages are returned by the
+ // RTM_GETADDR request. That means at least one RTM_NEWLINK message and one
+ // NLMSG_DONE message. We cannot read all the messages without blocking
+ // because we would need to read the message into a buffer and check the
+ // nlmsg_type for NLMSG_DONE. However, the test depends on reading into a
+ // zero-length buffer.
+
+ // First, call recvmsg with MSG_TRUNC. This will read the full message from
+ // the socket and return it's full length. Subsequent calls to recvmsg will
+ // read the next messages from the socket.
+ ASSERT_THAT(trunclen = RetryEINTR(recvmsg)(fd.get(), &msg, MSG_TRUNC),
+ SyscallSucceeds());
+
+ // Message should always be truncated. However, While the destination iov is
+ // zero length, MSG_TRUNC returns the size of the next message so it should
+ // not be zero.
+ ASSERT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+ ASSERT_NE(trunclen, 0);
+ // Returned length is at least the header and ifaddrmsg.
+ EXPECT_GE(trunclen, sizeof(struct nlmsghdr) + sizeof(struct ifaddrmsg));
+
+ // Reset the msg_flags to make sure that the recvmsg call is setting them
+ // properly.
+ msg.msg_flags = 0;
+
+ // Make a second recvvmsg call to get the next message.
+ ASSERT_THAT(trunclen2 = RetryEINTR(recvmsg)(fd.get(), &msg, MSG_TRUNC),
+ SyscallSucceeds());
+ ASSERT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+ ASSERT_NE(trunclen2, 0);
+
+ // Assert that the received messages are not the same.
+ //
+ // We are calling recvmsg with a zero length buffer so we have no way to
+ // inspect the messages to make sure they are not equal in value. The best
+ // we can do is to compare their lengths.
+ ASSERT_NE(trunclen, trunclen2);
+}
+
+// RecvmsgTruncPeek tests recvmsg with the combination of the MSG_TRUNC and
+// MSG_PEEK flags and a zero length output buffer. This is normally used to
+// read the full length of the next message on the socket without consuming
+// it, so a properly sized buffer can be allocated to store the message. This
+// test tests that scenario.
+TEST(NetlinkRouteTest, RecvmsgTruncPeek) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct rtgenmsg rgm;
+ };
+
+ struct request req;
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETADDR;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.hdr.nlmsg_seq = kSeq;
+ req.rgm.rtgen_family = AF_UNSPEC;
+
+ struct iovec iov = {};
+ iov.iov_base = &req;
+ iov.iov_len = sizeof(req);
+
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+ int type = -1;
+ do {
+ int peeklen;
+ int len;
+
+ iov.iov_base = NULL;
+ iov.iov_len = 0;
+
+ // Call recvmsg with MSG_PEEK and MSG_TRUNC. This will peek at the message
+ // and return it's full length.
+ // See: MSG_TRUNC http://man7.org/linux/man-pages/man2/recv.2.html
+ ASSERT_THAT(
+ peeklen = RetryEINTR(recvmsg)(fd.get(), &msg, MSG_PEEK | MSG_TRUNC),
+ SyscallSucceeds());
+
+ // Message should always be truncated.
+ ASSERT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+ ASSERT_NE(peeklen, 0);
+
+ // Reset the message flags for the next call.
+ msg.msg_flags = 0;
+
+ // Make the actual call to recvmsg to get the actual data. We will use
+ // the length returned from the peek call for the allocated buffer size..
+ std::vector<char> buf(peeklen);
+ iov.iov_base = buf.data();
+ iov.iov_len = buf.size();
+ ASSERT_THAT(len = RetryEINTR(recvmsg)(fd.get(), &msg, 0),
+ SyscallSucceeds());
+
+ // Message should not be truncated since we allocated the correct buffer
+ // size.
+ EXPECT_NE(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+
+ // MSG_PEEK should have left data on the socket and the subsequent call
+ // with should have retrieved the same data. Both calls should have
+ // returned the message's full length so they should be equal.
+ ASSERT_NE(len, 0);
+ ASSERT_EQ(peeklen, len);
+
+ for (struct nlmsghdr* hdr = reinterpret_cast<struct nlmsghdr*>(buf.data());
+ NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) {
+ type = hdr->nlmsg_type;
+ }
+ } while (type != NLMSG_DONE && type != NLMSG_ERROR);
+}
+
+// No SCM_CREDENTIALS are received without SO_PASSCRED set.
+TEST(NetlinkRouteTest, NoPasscredNoCreds) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+ ASSERT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOff,
+ sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct rtgenmsg rgm;
+ };
+
+ struct request req;
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETADDR;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.hdr.nlmsg_seq = kSeq;
+ req.rgm.rtgen_family = AF_UNSPEC;
+
+ struct iovec iov = {};
+ iov.iov_base = &req;
+ iov.iov_len = sizeof(req);
+
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+ iov.iov_base = NULL;
+ iov.iov_len = 0;
+
+ char control[CMSG_SPACE(sizeof(struct ucred))] = {};
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ // Note: This test assumes at least one message is returned by the
+ // RTM_GETADDR request.
+ ASSERT_THAT(RetryEINTR(recvmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+ // No control messages.
+ EXPECT_EQ(CMSG_FIRSTHDR(&msg), nullptr);
+}
+
+// SCM_CREDENTIALS are received with SO_PASSCRED set.
+TEST(NetlinkRouteTest, PasscredCreds) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_ROUTE));
+
+ ASSERT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct rtgenmsg rgm;
+ };
+
+ struct request req;
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETADDR;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.hdr.nlmsg_seq = kSeq;
+ req.rgm.rtgen_family = AF_UNSPEC;
+
+ struct iovec iov = {};
+ iov.iov_base = &req;
+ iov.iov_len = sizeof(req);
+
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+ iov.iov_base = NULL;
+ iov.iov_len = 0;
+
+ char control[CMSG_SPACE(sizeof(struct ucred))] = {};
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ // Note: This test assumes at least one message is returned by the
+ // RTM_GETADDR request.
+ ASSERT_THAT(RetryEINTR(recvmsg)(fd.get(), &msg, 0), SyscallSucceeds());
+
+ struct ucred creds;
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(creds)));
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+
+ memcpy(&creds, CMSG_DATA(cmsg), sizeof(creds));
+
+ // The peer is the kernel, which is "PID" 0.
+ EXPECT_EQ(creds.pid, 0);
+ // The kernel identifies as root. Also allow nobody in case this test is
+ // running in a userns without root mapped.
+ EXPECT_THAT(creds.uid, AnyOf(Eq(0), Eq(65534)));
+ EXPECT_THAT(creds.gid, AnyOf(Eq(0), Eq(65534)));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_route_util.cc b/test/syscalls/linux/socket_netlink_route_util.cc
new file mode 100644
index 000000000..bde1dbb4d
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route_util.cc
@@ -0,0 +1,162 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+
+#include <linux/if.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr uint32_t kSeq = 12345;
+
+} // namespace
+
+PosixError DumpLinks(
+ const FileDescriptor& fd, uint32_t seq,
+ const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+ struct request {
+ struct nlmsghdr hdr;
+ struct ifinfomsg ifm;
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = RTM_GETLINK;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.hdr.nlmsg_seq = seq;
+ req.ifm.ifi_family = AF_UNSPEC;
+
+ return NetlinkRequestResponse(fd, &req, sizeof(req), fn, false);
+}
+
+PosixErrorOr<std::vector<Link>> DumpLinks() {
+ ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+ std::vector<Link> links;
+ RETURN_IF_ERRNO(DumpLinks(fd, kSeq, [&](const struct nlmsghdr* hdr) {
+ if (hdr->nlmsg_type != RTM_NEWLINK ||
+ hdr->nlmsg_len < NLMSG_SPACE(sizeof(struct ifinfomsg))) {
+ return;
+ }
+ const struct ifinfomsg* msg =
+ reinterpret_cast<const struct ifinfomsg*>(NLMSG_DATA(hdr));
+ const auto* rta = FindRtAttr(hdr, msg, IFLA_IFNAME);
+ if (rta == nullptr) {
+ // Ignore links that do not have a name.
+ return;
+ }
+
+ links.emplace_back();
+ links.back().index = msg->ifi_index;
+ links.back().type = msg->ifi_type;
+ links.back().name =
+ std::string(reinterpret_cast<const char*>(RTA_DATA(rta)));
+ }));
+ return links;
+}
+
+PosixErrorOr<Link> LoopbackLink() {
+ ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+ for (const auto& link : links) {
+ if (link.type == ARPHRD_LOOPBACK) {
+ return link;
+ }
+ }
+ return PosixError(ENOENT, "loopback link not found");
+}
+
+PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
+ const void* addr, int addrlen) {
+ ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct ifaddrmsg ifaddr;
+ char attrbuf[512];
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifaddr));
+ req.hdr.nlmsg_type = RTM_NEWADDR;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.hdr.nlmsg_seq = kSeq;
+ req.ifaddr.ifa_index = index;
+ req.ifaddr.ifa_family = family;
+ req.ifaddr.ifa_prefixlen = prefixlen;
+
+ struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+ reinterpret_cast<int8_t*>(&req) + NLMSG_ALIGN(req.hdr.nlmsg_len));
+ rta->rta_type = IFA_LOCAL;
+ rta->rta_len = RTA_LENGTH(addrlen);
+ req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + RTA_LENGTH(addrlen);
+ memcpy(RTA_DATA(rta), addr, addrlen);
+
+ return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change) {
+ ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct ifinfomsg ifinfo;
+ char pad[NLMSG_ALIGNTO];
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifinfo));
+ req.hdr.nlmsg_type = RTM_NEWLINK;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.hdr.nlmsg_seq = kSeq;
+ req.ifinfo.ifi_index = index;
+ req.ifinfo.ifi_flags = flags;
+ req.ifinfo.ifi_change = change;
+
+ return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+PosixError LinkSetMacAddr(int index, const void* addr, int addrlen) {
+ ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, NetlinkBoundSocket(NETLINK_ROUTE));
+
+ struct request {
+ struct nlmsghdr hdr;
+ struct ifinfomsg ifinfo;
+ char attrbuf[512];
+ };
+
+ struct request req = {};
+ req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(req.ifinfo));
+ req.hdr.nlmsg_type = RTM_NEWLINK;
+ req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.hdr.nlmsg_seq = kSeq;
+ req.ifinfo.ifi_index = index;
+
+ struct rtattr* rta = reinterpret_cast<struct rtattr*>(
+ reinterpret_cast<int8_t*>(&req) + NLMSG_ALIGN(req.hdr.nlmsg_len));
+ rta->rta_type = IFLA_ADDRESS;
+ rta->rta_len = RTA_LENGTH(addrlen);
+ req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) + RTA_LENGTH(addrlen);
+ memcpy(RTA_DATA(rta), addr, addrlen);
+
+ return NetlinkRequestAckOrError(fd, kSeq, &req, req.hdr.nlmsg_len);
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_route_util.h b/test/syscalls/linux/socket_netlink_route_util.h
new file mode 100644
index 000000000..149c4a7f6
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_route_util.h
@@ -0,0 +1,55 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+namespace gvisor {
+namespace testing {
+
+struct Link {
+ int index;
+ int16_t type;
+ std::string name;
+};
+
+PosixError DumpLinks(const FileDescriptor& fd, uint32_t seq,
+ const std::function<void(const struct nlmsghdr* hdr)>& fn);
+
+PosixErrorOr<std::vector<Link>> DumpLinks();
+
+// Returns the loopback link on the system. ENOENT if not found.
+PosixErrorOr<Link> LoopbackLink();
+
+// LinkAddLocalAddr sets IFA_LOCAL attribute on the interface.
+PosixError LinkAddLocalAddr(int index, int family, int prefixlen,
+ const void* addr, int addrlen);
+
+// LinkChangeFlags changes interface flags. E.g. IFF_UP.
+PosixError LinkChangeFlags(int index, unsigned int flags, unsigned int change);
+
+// LinkSetMacAddr sets IFLA_ADDRESS attribute of the interface.
+PosixError LinkSetMacAddr(int index, const void* addr, int addrlen);
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NETLINK_ROUTE_UTIL_H_
diff --git a/test/syscalls/linux/socket_netlink_uevent.cc b/test/syscalls/linux/socket_netlink_uevent.cc
new file mode 100644
index 000000000..da425bed4
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_uevent.cc
@@ -0,0 +1,83 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <linux/filter.h>
+#include <linux/netlink.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_netlink_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+// Tests for NETLINK_KOBJECT_UEVENT sockets.
+//
+// gVisor never sends any messages on these sockets, so we don't test the events
+// themselves.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// SO_PASSCRED can be enabled. Since no messages are sent in gVisor, we don't
+// actually test receiving credentials.
+TEST(NetlinkUeventTest, PassCred) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_KOBJECT_UEVENT));
+
+ EXPECT_THAT(setsockopt(fd.get(), SOL_SOCKET, SO_PASSCRED, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+}
+
+// SO_DETACH_FILTER fails without a filter already installed.
+TEST(NetlinkUeventTest, DetachNoFilter) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_KOBJECT_UEVENT));
+
+ int opt;
+ EXPECT_THAT(
+ setsockopt(fd.get(), SOL_SOCKET, SO_DETACH_FILTER, &opt, sizeof(opt)),
+ SyscallFailsWithErrno(ENOENT));
+}
+
+// We can attach a BPF filter.
+TEST(NetlinkUeventTest, AttachFilter) {
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(NetlinkBoundSocket(NETLINK_KOBJECT_UEVENT));
+
+ // Minimal BPF program: a single ret.
+ struct sock_filter filter = {0x6, 0, 0, 0};
+ struct sock_fprog prog = {};
+ prog.len = 1;
+ prog.filter = &filter;
+
+ EXPECT_THAT(
+ setsockopt(fd.get(), SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog)),
+ SyscallSucceeds());
+
+ int opt;
+ EXPECT_THAT(
+ setsockopt(fd.get(), SOL_SOCKET, SO_DETACH_FILTER, &opt, sizeof(opt)),
+ SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_util.cc b/test/syscalls/linux/socket_netlink_util.cc
new file mode 100644
index 000000000..952eecfe8
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_util.cc
@@ -0,0 +1,187 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_netlink_util.h"
+
+#include <linux/if_arp.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <sys/socket.h>
+
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+PosixErrorOr<FileDescriptor> NetlinkBoundSocket(int protocol) {
+ FileDescriptor fd;
+ ASSIGN_OR_RETURN_ERRNO(fd, Socket(AF_NETLINK, SOCK_RAW, protocol));
+
+ struct sockaddr_nl addr = {};
+ addr.nl_family = AF_NETLINK;
+
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ bind(fd.get(), reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)));
+ MaybeSave();
+
+ return std::move(fd);
+}
+
+PosixErrorOr<uint32_t> NetlinkPortID(int fd) {
+ struct sockaddr_nl addr;
+ socklen_t addrlen = sizeof(addr);
+
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ getsockname(fd, reinterpret_cast<struct sockaddr*>(&addr), &addrlen));
+ MaybeSave();
+
+ return static_cast<uint32_t>(addr.nl_pid);
+}
+
+PosixError NetlinkRequestResponse(
+ const FileDescriptor& fd, void* request, size_t len,
+ const std::function<void(const struct nlmsghdr* hdr)>& fn,
+ bool expect_nlmsgerr) {
+ struct iovec iov = {};
+ iov.iov_base = request;
+ iov.iov_len = len;
+
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ // No destination required; it defaults to pid 0, the kernel.
+
+ RETURN_ERROR_IF_SYSCALL_FAIL(RetryEINTR(sendmsg)(fd.get(), &msg, 0));
+
+ constexpr size_t kBufferSize = 4096;
+ std::vector<char> buf(kBufferSize);
+ iov.iov_base = buf.data();
+ iov.iov_len = buf.size();
+
+ // If NLM_F_MULTI is set, response is a series of messages that ends with a
+ // NLMSG_DONE message.
+ int type = -1;
+ int flags = 0;
+ do {
+ int len;
+ RETURN_ERROR_IF_SYSCALL_FAIL(len = RetryEINTR(recvmsg)(fd.get(), &msg, 0));
+
+ // We don't bother with the complexity of dealing with truncated messages.
+ // We must allocate a large enough buffer up front.
+ if ((msg.msg_flags & MSG_TRUNC) == MSG_TRUNC) {
+ return PosixError(EIO,
+ absl::StrCat("Received truncated message with flags: ",
+ msg.msg_flags));
+ }
+
+ for (struct nlmsghdr* hdr = reinterpret_cast<struct nlmsghdr*>(buf.data());
+ NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) {
+ fn(hdr);
+ flags = hdr->nlmsg_flags;
+ type = hdr->nlmsg_type;
+ // Done should include an integer payload for dump_done_errno.
+ // See net/netlink/af_netlink.c:netlink_dump
+ // Some tools like the 'ip' tool check the minimum length of the
+ // NLMSG_DONE message.
+ if (type == NLMSG_DONE) {
+ EXPECT_GE(hdr->nlmsg_len, NLMSG_LENGTH(sizeof(int)));
+ }
+ }
+ } while ((flags & NLM_F_MULTI) && type != NLMSG_DONE && type != NLMSG_ERROR);
+
+ if (expect_nlmsgerr) {
+ EXPECT_EQ(type, NLMSG_ERROR);
+ } else if (flags & NLM_F_MULTI) {
+ EXPECT_EQ(type, NLMSG_DONE);
+ }
+ return NoError();
+}
+
+PosixError NetlinkRequestResponseSingle(
+ const FileDescriptor& fd, void* request, size_t len,
+ const std::function<void(const struct nlmsghdr* hdr)>& fn) {
+ struct iovec iov = {};
+ iov.iov_base = request;
+ iov.iov_len = len;
+
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ // No destination required; it defaults to pid 0, the kernel.
+
+ RETURN_ERROR_IF_SYSCALL_FAIL(RetryEINTR(sendmsg)(fd.get(), &msg, 0));
+
+ constexpr size_t kBufferSize = 4096;
+ std::vector<char> buf(kBufferSize);
+ iov.iov_base = buf.data();
+ iov.iov_len = buf.size();
+
+ int ret;
+ RETURN_ERROR_IF_SYSCALL_FAIL(ret = RetryEINTR(recvmsg)(fd.get(), &msg, 0));
+
+ // We don't bother with the complexity of dealing with truncated messages.
+ // We must allocate a large enough buffer up front.
+ if ((msg.msg_flags & MSG_TRUNC) == MSG_TRUNC) {
+ return PosixError(
+ EIO,
+ absl::StrCat("Received truncated message with flags: ", msg.msg_flags));
+ }
+
+ for (struct nlmsghdr* hdr = reinterpret_cast<struct nlmsghdr*>(buf.data());
+ NLMSG_OK(hdr, ret); hdr = NLMSG_NEXT(hdr, ret)) {
+ fn(hdr);
+ }
+
+ return NoError();
+}
+
+PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq,
+ void* request, size_t len) {
+ // Dummy negative number for no error message received.
+ // We won't get a negative error number so there will be no confusion.
+ int err = -42;
+ RETURN_IF_ERRNO(NetlinkRequestResponse(
+ fd, request, len,
+ [&](const struct nlmsghdr* hdr) {
+ EXPECT_EQ(NLMSG_ERROR, hdr->nlmsg_type);
+ EXPECT_EQ(hdr->nlmsg_seq, seq);
+ EXPECT_GE(hdr->nlmsg_len, sizeof(*hdr) + sizeof(struct nlmsgerr));
+
+ const struct nlmsgerr* msg =
+ reinterpret_cast<const struct nlmsgerr*>(NLMSG_DATA(hdr));
+ err = -msg->error;
+ },
+ true));
+ return PosixError(err);
+}
+
+const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr,
+ const struct ifinfomsg* msg, int16_t attr) {
+ const int ifi_space = NLMSG_SPACE(sizeof(*msg));
+ int attrlen = hdr->nlmsg_len - ifi_space;
+ const struct rtattr* rta = reinterpret_cast<const struct rtattr*>(
+ reinterpret_cast<const uint8_t*>(hdr) + NLMSG_ALIGN(ifi_space));
+ for (; RTA_OK(rta, attrlen); rta = RTA_NEXT(rta, attrlen)) {
+ if (rta->rta_type == attr) {
+ return rta;
+ }
+ }
+ return nullptr;
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_netlink_util.h b/test/syscalls/linux/socket_netlink_util.h
new file mode 100644
index 000000000..e13ead406
--- /dev/null
+++ b/test/syscalls/linux/socket_netlink_util.h
@@ -0,0 +1,62 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_SOCKET_NETLINK_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_SOCKET_NETLINK_UTIL_H_
+
+#include <sys/socket.h>
+// socket.h has to be included before if_arp.h.
+#include <linux/if_arp.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+
+namespace gvisor {
+namespace testing {
+
+// Returns a bound netlink socket.
+PosixErrorOr<FileDescriptor> NetlinkBoundSocket(int protocol);
+
+// Returns the port ID of the passed socket.
+PosixErrorOr<uint32_t> NetlinkPortID(int fd);
+
+// Send the passed request and call fn on all response netlink messages.
+//
+// To be used on requests with NLM_F_MULTI reponses.
+PosixError NetlinkRequestResponse(
+ const FileDescriptor& fd, void* request, size_t len,
+ const std::function<void(const struct nlmsghdr* hdr)>& fn,
+ bool expect_nlmsgerr);
+
+// Send the passed request and call fn on all response netlink messages.
+//
+// To be used on requests without NLM_F_MULTI reponses.
+PosixError NetlinkRequestResponseSingle(
+ const FileDescriptor& fd, void* request, size_t len,
+ const std::function<void(const struct nlmsghdr* hdr)>& fn);
+
+// Send the passed request then expect and return an ack or error.
+PosixError NetlinkRequestAckOrError(const FileDescriptor& fd, uint32_t seq,
+ void* request, size_t len);
+
+// Find rtnetlink attribute in message.
+const struct rtattr* FindRtAttr(const struct nlmsghdr* hdr,
+ const struct ifinfomsg* msg, int16_t attr);
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_SOCKET_NETLINK_UTIL_H_
diff --git a/test/syscalls/linux/socket_non_blocking.cc b/test/syscalls/linux/socket_non_blocking.cc
new file mode 100644
index 000000000..c3520cadd
--- /dev/null
+++ b/test/syscalls/linux/socket_non_blocking.cc
@@ -0,0 +1,62 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_non_blocking.h"
+
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+TEST_P(NonBlockingSocketPairTest, ReadNothingAvailable) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char buf[20] = {};
+ ASSERT_THAT(ReadFd(sockets->first_fd(), buf, sizeof(buf)),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(NonBlockingSocketPairTest, RecvNothingAvailable) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char buf[20] = {};
+ ASSERT_THAT(RetryEINTR(recv)(sockets->first_fd(), buf, sizeof(buf), 0),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(NonBlockingSocketPairTest, RecvMsgNothingAvailable) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct iovec iov;
+ char buf[20] = {};
+ iov.iov_base = buf;
+ iov.iov_len = sizeof(buf);
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->first_fd(), &msg, 0),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_non_blocking.h b/test/syscalls/linux/socket_non_blocking.h
new file mode 100644
index 000000000..bd3e02fd2
--- /dev/null
+++ b/test/syscalls/linux/socket_non_blocking.h
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_BLOCKING_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_BLOCKING_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected non-blocking sockets.
+using NonBlockingSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_BLOCKING_H_
diff --git a/test/syscalls/linux/socket_non_stream.cc b/test/syscalls/linux/socket_non_stream.cc
new file mode 100644
index 000000000..c61817f14
--- /dev/null
+++ b/test/syscalls/linux/socket_non_stream.cc
@@ -0,0 +1,337 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_non_stream.h"
+
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(NonStreamSocketPairTest, SendMsgTooLarge) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int sndbuf;
+ socklen_t length = sizeof(sndbuf);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, &sndbuf, &length),
+ SyscallSucceeds());
+
+ // Make the call too large to fit in the send buffer.
+ const int buffer_size = 3 * sndbuf;
+
+ EXPECT_THAT(SendLargeSendMsg(sockets, buffer_size, false /* reader */),
+ SyscallFailsWithErrno(EMSGSIZE));
+}
+
+// Stream sockets allow data sent with a single (e.g. write, sendmsg) syscall
+// to be read in pieces with multiple (e.g. read, recvmsg) syscalls.
+//
+// SplitRecv checks that control messages can only be read on the first (e.g.
+// read, recvmsg) syscall, even if it doesn't provide space for the control
+// message.
+TEST_P(NonStreamSocketPairTest, SplitRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ char received_data[sizeof(sent_data) / 2];
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(received_data)));
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+// Stream sockets allow data sent with multiple sends to be read in a single
+// recv. Datagram sockets do not.
+//
+// SingleRecv checks that only a single message is readable in a single recv.
+TEST_P(NonStreamSocketPairTest, SingleRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data1, sizeof(sent_data1), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data2, sizeof(sent_data2), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+ char received_data[sizeof(sent_data1) + sizeof(sent_data2)];
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+ EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1)));
+}
+
+TEST_P(NonStreamSocketPairTest, RecvmsgMsghdrFlagMsgTrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[sizeof(sent_data) / 2] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data)));
+
+ // Check that msghdr flags were updated.
+ EXPECT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+}
+
+// Stream sockets allow data sent with multiple sends to be peeked at in a
+// single recv. Datagram sockets (except for unix sockets) do not.
+//
+// SinglePeek checks that only a single message is peekable in a single recv.
+TEST_P(NonStreamSocketPairTest, SinglePeek) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data1, sizeof(sent_data1), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data2, sizeof(sent_data2), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+ char received_data[sizeof(sent_data1) + sizeof(sent_data2)];
+ for (int i = 0; i < 3; i++) {
+ memset(received_data, 0, sizeof(received_data));
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), MSG_PEEK),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+ EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1)));
+ }
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(sent_data1), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+ EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1)));
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(sent_data2), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+ EXPECT_EQ(0, memcmp(sent_data2, received_data, sizeof(sent_data2)));
+}
+
+TEST_P(NonStreamSocketPairTest, MsgTruncTruncation) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ char received_data[sizeof(sent_data)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data) / 2, MSG_TRUNC),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2));
+
+ // Check that we didn't get any extra data.
+ EXPECT_NE(0, memcmp(sent_data + sizeof(sent_data) / 2,
+ received_data + sizeof(received_data) / 2,
+ sizeof(sent_data) / 2));
+}
+
+TEST_P(NonStreamSocketPairTest, MsgTruncTruncationRecvmsgMsghdrFlagMsgTrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[sizeof(sent_data) / 2] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data)));
+
+ // Check that msghdr flags were updated.
+ EXPECT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+}
+
+TEST_P(NonStreamSocketPairTest, MsgTruncSameSize) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ char received_data[sizeof(sent_data)];
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), MSG_TRUNC),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(NonStreamSocketPairTest, MsgTruncNotFull) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ char received_data[2 * sizeof(sent_data)];
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), MSG_TRUNC),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+// This test tests reading from a socket with MSG_TRUNC and a zero length
+// receive buffer. The user should be able to get the message length.
+TEST_P(NonStreamSocketPairTest, RecvmsgMsgTruncZeroLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ // The receive buffer is of zero length.
+ char received_data[0] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ // The syscall succeeds returning the full size of the message on the socket.
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ // Check that MSG_TRUNC is set on msghdr flags.
+ EXPECT_EQ(msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+}
+
+// This test tests reading from a socket with MSG_TRUNC | MSG_PEEK and a zero
+// length receive buffer. The user should be able to get the message length
+// without reading data off the socket.
+TEST_P(NonStreamSocketPairTest, RecvmsgMsgTruncMsgPeekZeroLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ // The receive buffer is of zero length.
+ char peek_data[0] = {};
+
+ struct iovec peek_iov;
+ peek_iov.iov_base = peek_data;
+ peek_iov.iov_len = sizeof(peek_data);
+ struct msghdr peek_msg = {};
+ peek_msg.msg_flags = -1;
+ peek_msg.msg_iov = &peek_iov;
+ peek_msg.msg_iovlen = 1;
+
+ // The syscall succeeds returning the full size of the message on the socket.
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &peek_msg,
+ MSG_TRUNC | MSG_PEEK),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ // Check that MSG_TRUNC is set on msghdr flags because the receive buffer is
+ // smaller than the message size.
+ EXPECT_EQ(peek_msg.msg_flags & MSG_TRUNC, MSG_TRUNC);
+
+ char received_data[sizeof(sent_data)] = {};
+
+ struct iovec received_iov;
+ received_iov.iov_base = received_data;
+ received_iov.iov_len = sizeof(received_data);
+ struct msghdr received_msg = {};
+ received_msg.msg_flags = -1;
+ received_msg.msg_iov = &received_iov;
+ received_msg.msg_iovlen = 1;
+
+ // Next we can read the actual data.
+ ASSERT_THAT(
+ RetryEINTR(recvmsg)(sockets->second_fd(), &received_msg, MSG_TRUNC),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // Check that MSG_TRUNC is not set on msghdr flags because we read the whole
+ // message.
+ EXPECT_EQ(received_msg.msg_flags & MSG_TRUNC, 0);
+}
+
+// This test tests reading from a socket with MSG_TRUNC | MSG_PEEK and a zero
+// length receive buffer and MSG_DONTWAIT. The user should be able to get an
+// EAGAIN or EWOULDBLOCK error response.
+TEST_P(NonStreamSocketPairTest, RecvmsgTruncPeekDontwaitZeroLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // NOTE: We don't send any data on the socket.
+
+ // The receive buffer is of zero length.
+ char peek_data[0] = {};
+
+ struct iovec peek_iov;
+ peek_iov.iov_base = peek_data;
+ peek_iov.iov_len = sizeof(peek_data);
+ struct msghdr peek_msg = {};
+ peek_msg.msg_flags = -1;
+ peek_msg.msg_iov = &peek_iov;
+ peek_msg.msg_iovlen = 1;
+
+ // recvmsg fails with EAGAIN because no data is available on the socket.
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &peek_msg,
+ MSG_TRUNC | MSG_PEEK | MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_non_stream.h b/test/syscalls/linux/socket_non_stream.h
new file mode 100644
index 000000000..469fbe6a2
--- /dev/null
+++ b/test/syscalls/linux/socket_non_stream.h
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected non-stream sockets.
+using NonStreamSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_H_
diff --git a/test/syscalls/linux/socket_non_stream_blocking.cc b/test/syscalls/linux/socket_non_stream_blocking.cc
new file mode 100644
index 000000000..b052f6e61
--- /dev/null
+++ b/test/syscalls/linux/socket_non_stream_blocking.cc
@@ -0,0 +1,85 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_non_stream_blocking.h"
+
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(BlockingNonStreamSocketPairTest, RecvLessThanBufferWaitAll) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[100];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[sizeof(sent_data) * 2] = {};
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), MSG_WAITALL),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+}
+
+// This test tests reading from a socket with MSG_TRUNC | MSG_PEEK and a zero
+// length receive buffer and MSG_DONTWAIT. The recvmsg call should block on
+// reading the data.
+TEST_P(BlockingNonStreamSocketPairTest,
+ RecvmsgTruncPeekDontwaitZeroLenBlocking) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // NOTE: We don't initially send any data on the socket.
+ const int data_size = 10;
+ char sent_data[data_size];
+ RandomizeBuffer(sent_data, data_size);
+
+ // The receive buffer is of zero length.
+ char peek_data[0] = {};
+
+ struct iovec peek_iov;
+ peek_iov.iov_base = peek_data;
+ peek_iov.iov_len = sizeof(peek_data);
+ struct msghdr peek_msg = {};
+ peek_msg.msg_flags = -1;
+ peek_msg.msg_iov = &peek_iov;
+ peek_msg.msg_iovlen = 1;
+
+ ScopedThread t([&]() {
+ // The syscall succeeds returning the full size of the message on the
+ // socket. This should block until there is data on the socket.
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &peek_msg,
+ MSG_TRUNC | MSG_PEEK),
+ SyscallSucceedsWithValue(data_size));
+ });
+
+ absl::SleepFor(absl::Seconds(1));
+ ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), sent_data, data_size, 0),
+ SyscallSucceedsWithValue(data_size));
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_non_stream_blocking.h b/test/syscalls/linux/socket_non_stream_blocking.h
new file mode 100644
index 000000000..6e205a039
--- /dev/null
+++ b/test/syscalls/linux/socket_non_stream_blocking.h
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_BLOCKING_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_BLOCKING_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of blocking connected non-stream
+// sockets.
+using BlockingNonStreamSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_NON_STREAM_BLOCKING_H_
diff --git a/test/syscalls/linux/socket_stream.cc b/test/syscalls/linux/socket_stream.cc
new file mode 100644
index 000000000..6522b2e01
--- /dev/null
+++ b/test/syscalls/linux/socket_stream.cc
@@ -0,0 +1,178 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_stream.h"
+
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(StreamSocketPairTest, SplitRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ char received_data[sizeof(sent_data) / 2];
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(received_data)));
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(sent_data + sizeof(received_data), received_data,
+ sizeof(received_data)));
+}
+
+// Stream sockets allow data sent with multiple sends to be read in a single
+// recv.
+//
+// CoalescedRecv checks that multiple messages are readable in a single recv.
+TEST_P(StreamSocketPairTest, CoalescedRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data1, sizeof(sent_data1), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data2, sizeof(sent_data2), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+ char received_data[sizeof(sent_data1) + sizeof(sent_data2)];
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1)));
+ EXPECT_EQ(0, memcmp(sent_data2, received_data + sizeof(sent_data1),
+ sizeof(sent_data2)));
+}
+
+TEST_P(StreamSocketPairTest, WriteOneSideClosed) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+ const char str[] = "abc";
+ ASSERT_THAT(write(sockets->second_fd(), str, 3),
+ SyscallFailsWithErrno(EPIPE));
+}
+
+TEST_P(StreamSocketPairTest, RecvmsgMsghdrFlagsNoMsgTrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[sizeof(sent_data) / 2] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(received_data, sent_data, sizeof(received_data)));
+
+ // Check that msghdr flags were cleared (MSG_TRUNC was not set).
+ ASSERT_EQ(msg.msg_flags & MSG_TRUNC, 0);
+}
+
+TEST_P(StreamSocketPairTest, RecvmsgTruncZeroLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[0] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC),
+ SyscallSucceedsWithValue(0));
+
+ // Check that msghdr flags were cleared (MSG_TRUNC was not set).
+ ASSERT_EQ(msg.msg_flags & MSG_TRUNC, 0);
+}
+
+TEST_P(StreamSocketPairTest, RecvmsgTruncPeekZeroLen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[0] = {};
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(
+ RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_TRUNC | MSG_PEEK),
+ SyscallSucceedsWithValue(0));
+
+ // Check that msghdr flags were cleared (MSG_TRUNC was not set).
+ ASSERT_EQ(msg.msg_flags & MSG_TRUNC, 0);
+}
+
+TEST_P(StreamSocketPairTest, MsgTrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ char received_data[sizeof(sent_data)];
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data) / 2, MSG_TRUNC),
+ SyscallSucceedsWithValue(sizeof(sent_data) / 2));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2));
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_stream.h b/test/syscalls/linux/socket_stream.h
new file mode 100644
index 000000000..b837b8f8c
--- /dev/null
+++ b/test/syscalls/linux/socket_stream.h
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of blocking and non-blocking
+// connected stream sockets.
+using StreamSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_H_
diff --git a/test/syscalls/linux/socket_stream_blocking.cc b/test/syscalls/linux/socket_stream_blocking.cc
new file mode 100644
index 000000000..538ee2268
--- /dev/null
+++ b/test/syscalls/linux/socket_stream_blocking.cc
@@ -0,0 +1,163 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_stream_blocking.h"
+
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/timer_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(BlockingStreamSocketPairTest, BlockPartialWriteClosed) {
+ // FIXME(b/35921550): gVisor doesn't support SO_SNDBUF on UDS, nor does it
+ // enforce any limit; it will write arbitrary amounts of data without
+ // blocking.
+ SKIP_IF(IsRunningOnGvisor());
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int buffer_size;
+ socklen_t length = sizeof(buffer_size);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF,
+ &buffer_size, &length),
+ SyscallSucceeds());
+
+ int wfd = sockets->first_fd();
+ ScopedThread t([wfd, buffer_size]() {
+ std::vector<char> buf(2 * buffer_size);
+ // Write more than fits in the buffer. Blocks then returns partial write
+ // when the other end is closed. The next call returns EPIPE.
+ //
+ // N.B. writes occur in chunks, so we may see less than buffer_size from
+ // the first call.
+ ASSERT_THAT(write(wfd, buf.data(), buf.size()),
+ SyscallSucceedsWithValue(::testing::Gt(0)));
+ ASSERT_THAT(write(wfd, buf.data(), buf.size()),
+ ::testing::AnyOf(SyscallFailsWithErrno(EPIPE),
+ SyscallFailsWithErrno(ECONNRESET)));
+ });
+
+ // Leave time for write to become blocked.
+ absl::SleepFor(absl::Seconds(1));
+
+ ASSERT_THAT(close(sockets->release_second_fd()), SyscallSucceeds());
+}
+
+// Random save may interrupt the call to sendmsg() in SendLargeSendMsg(),
+// causing the write to be incomplete and the test to hang.
+TEST_P(BlockingStreamSocketPairTest, SendMsgTooLarge_NoRandomSave) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int sndbuf;
+ socklen_t length = sizeof(sndbuf);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, &sndbuf, &length),
+ SyscallSucceeds());
+
+ // Make the call too large to fit in the send buffer.
+ const int buffer_size = 3 * sndbuf;
+
+ EXPECT_THAT(SendLargeSendMsg(sockets, buffer_size, true /* reader */),
+ SyscallSucceedsWithValue(buffer_size));
+}
+
+TEST_P(BlockingStreamSocketPairTest, RecvLessThanBuffer) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[100];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[200] = {};
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+}
+
+// Test that MSG_WAITALL causes recv to block until all requested data is
+// received. Random save can interrupt blocking and cause received data to be
+// returned, even if the amount received is less than the full requested amount.
+TEST_P(BlockingStreamSocketPairTest, RecvLessThanBufferWaitAll_NoRandomSave) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[100];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ constexpr auto kDuration = absl::Milliseconds(200);
+ auto before = Now(CLOCK_MONOTONIC);
+
+ const ScopedThread t([&]() {
+ absl::SleepFor(kDuration);
+
+ // Don't let saving after the write interrupt the blocking recv.
+ const DisableSave ds;
+
+ ASSERT_THAT(write(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ });
+
+ char received_data[sizeof(sent_data) * 2] = {};
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), MSG_WAITALL),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ auto after = Now(CLOCK_MONOTONIC);
+ EXPECT_GE(after - before, kDuration);
+}
+
+TEST_P(BlockingStreamSocketPairTest, SendTimeout) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = 10
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ std::vector<char> buf(kPageSize);
+ // We don't know how much data the socketpair will buffer, so we may do an
+ // arbitrarily large number of writes; saving after each write causes this
+ // test's time to explode.
+ const DisableSave ds;
+ for (;;) {
+ int ret;
+ ASSERT_THAT(
+ ret = RetryEINTR(send)(sockets->first_fd(), buf.data(), buf.size(), 0),
+ ::testing::AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(EAGAIN)));
+ if (ret == -1) {
+ break;
+ }
+ }
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_stream_blocking.h b/test/syscalls/linux/socket_stream_blocking.h
new file mode 100644
index 000000000..9fd19ff90
--- /dev/null
+++ b/test/syscalls/linux/socket_stream_blocking.h
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_BLOCKING_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_BLOCKING_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of blocking connected stream
+// sockets.
+using BlockingStreamSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_BLOCKING_H_
diff --git a/test/syscalls/linux/socket_stream_nonblock.cc b/test/syscalls/linux/socket_stream_nonblock.cc
new file mode 100644
index 000000000..74d608741
--- /dev/null
+++ b/test/syscalls/linux/socket_stream_nonblock.cc
@@ -0,0 +1,49 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_stream_nonblock.h"
+
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+using ::testing::Le;
+
+TEST_P(NonBlockingStreamSocketPairTest, SendMsgTooLarge) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int sndbuf;
+ socklen_t length = sizeof(sndbuf);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, &sndbuf, &length),
+ SyscallSucceeds());
+
+ // Make the call too large to fit in the send buffer.
+ const int buffer_size = 3 * sndbuf;
+
+ EXPECT_THAT(SendLargeSendMsg(sockets, buffer_size, false /* reader */),
+ SyscallSucceedsWithValue(Le(buffer_size)));
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_stream_nonblock.h b/test/syscalls/linux/socket_stream_nonblock.h
new file mode 100644
index 000000000..c3b7fad91
--- /dev/null
+++ b/test/syscalls/linux/socket_stream_nonblock.h
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_NONBLOCK_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_NONBLOCK_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of non-blocking connected stream
+// sockets.
+using NonBlockingStreamSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_STREAM_NONBLOCK_H_
diff --git a/test/syscalls/linux/socket_test_util.cc b/test/syscalls/linux/socket_test_util.cc
new file mode 100644
index 000000000..53b678e94
--- /dev/null
+++ b/test/syscalls/linux/socket_test_util.cc
@@ -0,0 +1,907 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+#include <arpa/inet.h>
+#include <poll.h>
+#include <sys/socket.h>
+
+#include <memory>
+
+#include "gtest/gtest.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "absl/time/clock.h"
+#include "absl/types/optional.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+Creator<SocketPair> SyscallSocketPairCreator(int domain, int type,
+ int protocol) {
+ return [=]() -> PosixErrorOr<std::unique_ptr<FDSocketPair>> {
+ int pair[2];
+ RETURN_ERROR_IF_SYSCALL_FAIL(socketpair(domain, type, protocol, pair));
+ MaybeSave(); // Save on successful creation.
+ return absl::make_unique<FDSocketPair>(pair[0], pair[1]);
+ };
+}
+
+Creator<FileDescriptor> SyscallSocketCreator(int domain, int type,
+ int protocol) {
+ return [=]() -> PosixErrorOr<std::unique_ptr<FileDescriptor>> {
+ int fd = 0;
+ RETURN_ERROR_IF_SYSCALL_FAIL(fd = socket(domain, type, protocol));
+ MaybeSave(); // Save on successful creation.
+ return absl::make_unique<FileDescriptor>(fd);
+ };
+}
+
+PosixErrorOr<struct sockaddr_un> UniqueUnixAddr(bool abstract, int domain) {
+ struct sockaddr_un addr = {};
+ std::string path = NewTempAbsPathInDir("/tmp");
+ if (path.size() >= sizeof(addr.sun_path)) {
+ return PosixError(EINVAL,
+ "Unable to generate a temp path of appropriate length");
+ }
+
+ if (abstract) {
+ // Indicate that the path is in the abstract namespace.
+ path[0] = 0;
+ }
+ memcpy(addr.sun_path, path.c_str(), path.length());
+ addr.sun_family = domain;
+ return addr;
+}
+
+Creator<SocketPair> AcceptBindSocketPairCreator(bool abstract, int domain,
+ int type, int protocol) {
+ return [=]() -> PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> {
+ ASSIGN_OR_RETURN_ERRNO(struct sockaddr_un bind_addr,
+ UniqueUnixAddr(abstract, domain));
+ ASSIGN_OR_RETURN_ERRNO(struct sockaddr_un extra_addr,
+ UniqueUnixAddr(abstract, domain));
+
+ int bound;
+ RETURN_ERROR_IF_SYSCALL_FAIL(bound = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ bind(bound, reinterpret_cast<struct sockaddr*>(&bind_addr),
+ sizeof(bind_addr)));
+ MaybeSave(); // Successful bind.
+ RETURN_ERROR_IF_SYSCALL_FAIL(listen(bound, /* backlog = */ 5));
+ MaybeSave(); // Successful listen.
+
+ int connected;
+ RETURN_ERROR_IF_SYSCALL_FAIL(connected = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ connect(connected, reinterpret_cast<struct sockaddr*>(&bind_addr),
+ sizeof(bind_addr)));
+ MaybeSave(); // Successful connect.
+
+ int accepted;
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ accepted = accept4(bound, nullptr, nullptr,
+ type & (SOCK_NONBLOCK | SOCK_CLOEXEC)));
+ MaybeSave(); // Successful connect.
+
+ // Cleanup no longer needed resources.
+ RETURN_ERROR_IF_SYSCALL_FAIL(close(bound));
+ MaybeSave(); // Dropped original socket.
+
+ // Only unlink if path is not in abstract namespace.
+ if (bind_addr.sun_path[0] != 0) {
+ RETURN_ERROR_IF_SYSCALL_FAIL(unlink(bind_addr.sun_path));
+ MaybeSave(); // Unlinked path.
+ }
+
+ // accepted is before connected to destruct connected before accepted.
+ // Destructors for nonstatic member objects are called in the reverse order
+ // in which they appear in the class declaration.
+ return absl::make_unique<AddrFDSocketPair>(accepted, connected, bind_addr,
+ extra_addr);
+ };
+}
+
+Creator<SocketPair> FilesystemAcceptBindSocketPairCreator(int domain, int type,
+ int protocol) {
+ return AcceptBindSocketPairCreator(/* abstract= */ false, domain, type,
+ protocol);
+}
+
+Creator<SocketPair> AbstractAcceptBindSocketPairCreator(int domain, int type,
+ int protocol) {
+ return AcceptBindSocketPairCreator(/* abstract= */ true, domain, type,
+ protocol);
+}
+
+Creator<SocketPair> BidirectionalBindSocketPairCreator(bool abstract,
+ int domain, int type,
+ int protocol) {
+ return [=]() -> PosixErrorOr<std::unique_ptr<FDSocketPair>> {
+ ASSIGN_OR_RETURN_ERRNO(struct sockaddr_un addr1,
+ UniqueUnixAddr(abstract, domain));
+ ASSIGN_OR_RETURN_ERRNO(struct sockaddr_un addr2,
+ UniqueUnixAddr(abstract, domain));
+
+ int sock1;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock1 = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ bind(sock1, reinterpret_cast<struct sockaddr*>(&addr1), sizeof(addr1)));
+ MaybeSave(); // Successful bind.
+
+ int sock2;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock2 = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ bind(sock2, reinterpret_cast<struct sockaddr*>(&addr2), sizeof(addr2)));
+ MaybeSave(); // Successful bind.
+
+ RETURN_ERROR_IF_SYSCALL_FAIL(connect(
+ sock1, reinterpret_cast<struct sockaddr*>(&addr2), sizeof(addr2)));
+ MaybeSave(); // Successful connect.
+
+ RETURN_ERROR_IF_SYSCALL_FAIL(connect(
+ sock2, reinterpret_cast<struct sockaddr*>(&addr1), sizeof(addr1)));
+ MaybeSave(); // Successful connect.
+
+ // Cleanup no longer needed resources.
+
+ // Only unlink if path is not in abstract namespace.
+ if (addr1.sun_path[0] != 0) {
+ RETURN_ERROR_IF_SYSCALL_FAIL(unlink(addr1.sun_path));
+ MaybeSave(); // Successful unlink.
+ }
+
+ // Only unlink if path is not in abstract namespace.
+ if (addr2.sun_path[0] != 0) {
+ RETURN_ERROR_IF_SYSCALL_FAIL(unlink(addr2.sun_path));
+ MaybeSave(); // Successful unlink.
+ }
+
+ return absl::make_unique<FDSocketPair>(sock1, sock2);
+ };
+}
+
+Creator<SocketPair> FilesystemBidirectionalBindSocketPairCreator(int domain,
+ int type,
+ int protocol) {
+ return BidirectionalBindSocketPairCreator(/* abstract= */ false, domain, type,
+ protocol);
+}
+
+Creator<SocketPair> AbstractBidirectionalBindSocketPairCreator(int domain,
+ int type,
+ int protocol) {
+ return BidirectionalBindSocketPairCreator(/* abstract= */ true, domain, type,
+ protocol);
+}
+
+Creator<SocketPair> SocketpairGoferSocketPairCreator(int domain, int type,
+ int protocol) {
+ return [=]() -> PosixErrorOr<std::unique_ptr<FDSocketPair>> {
+ struct sockaddr_un addr = {};
+ constexpr char kSocketGoferPath[] = "/socket";
+ memcpy(addr.sun_path, kSocketGoferPath, sizeof(kSocketGoferPath));
+ addr.sun_family = domain;
+
+ int sock1;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock1 = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+ RETURN_ERROR_IF_SYSCALL_FAIL(connect(
+ sock1, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)));
+ MaybeSave(); // Successful connect.
+
+ int sock2;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock2 = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+ RETURN_ERROR_IF_SYSCALL_FAIL(connect(
+ sock2, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)));
+ MaybeSave(); // Successful connect.
+
+ // Make and close another socketpair to ensure that the duped ends of the
+ // first socketpair get closed.
+ //
+ // The problem is that there is no way to atomically send and close an FD.
+ // The closest that we can do is send and then immediately close the FD,
+ // which is what we do in the gofer. The gofer won't respond to another
+ // request until the reply is sent and the FD is closed, so forcing the
+ // gofer to handle another request will ensure that this has happened.
+ for (int i = 0; i < 2; i++) {
+ int sock;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock = socket(domain, type, protocol));
+ RETURN_ERROR_IF_SYSCALL_FAIL(connect(
+ sock, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)));
+ RETURN_ERROR_IF_SYSCALL_FAIL(close(sock));
+ }
+
+ return absl::make_unique<FDSocketPair>(sock1, sock2);
+ };
+}
+
+Creator<SocketPair> SocketpairGoferFileSocketPairCreator(int flags) {
+ return [=]() -> PosixErrorOr<std::unique_ptr<FDSocketPair>> {
+ constexpr char kSocketGoferPath[] = "/socket";
+
+ int sock1;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock1 =
+ open(kSocketGoferPath, O_RDWR | flags));
+ MaybeSave(); // Successful socket creation.
+
+ int sock2;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock2 =
+ open(kSocketGoferPath, O_RDWR | flags));
+ MaybeSave(); // Successful socket creation.
+
+ return absl::make_unique<FDSocketPair>(sock1, sock2);
+ };
+}
+
+Creator<SocketPair> UnboundSocketPairCreator(bool abstract, int domain,
+ int type, int protocol) {
+ return [=]() -> PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> {
+ ASSIGN_OR_RETURN_ERRNO(struct sockaddr_un addr1,
+ UniqueUnixAddr(abstract, domain));
+ ASSIGN_OR_RETURN_ERRNO(struct sockaddr_un addr2,
+ UniqueUnixAddr(abstract, domain));
+
+ int sock1;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock1 = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+ int sock2;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock2 = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+ return absl::make_unique<AddrFDSocketPair>(sock1, sock2, addr1, addr2);
+ };
+}
+
+Creator<SocketPair> FilesystemUnboundSocketPairCreator(int domain, int type,
+ int protocol) {
+ return UnboundSocketPairCreator(/* abstract= */ false, domain, type,
+ protocol);
+}
+
+Creator<SocketPair> AbstractUnboundSocketPairCreator(int domain, int type,
+ int protocol) {
+ return UnboundSocketPairCreator(/* abstract= */ true, domain, type, protocol);
+}
+
+void LocalhostAddr(struct sockaddr_in* addr, bool dual_stack) {
+ addr->sin_family = AF_INET;
+ addr->sin_port = htons(0);
+ inet_pton(AF_INET, "127.0.0.1",
+ reinterpret_cast<void*>(&addr->sin_addr.s_addr));
+}
+
+void LocalhostAddr(struct sockaddr_in6* addr, bool dual_stack) {
+ addr->sin6_family = AF_INET6;
+ addr->sin6_port = htons(0);
+ if (dual_stack) {
+ inet_pton(AF_INET6, "::ffff:127.0.0.1",
+ reinterpret_cast<void*>(&addr->sin6_addr.s6_addr));
+ } else {
+ inet_pton(AF_INET6, "::1",
+ reinterpret_cast<void*>(&addr->sin6_addr.s6_addr));
+ }
+ addr->sin6_scope_id = 0;
+}
+
+template <typename T>
+PosixErrorOr<T> BindIP(int fd, bool dual_stack) {
+ T addr = {};
+ LocalhostAddr(&addr, dual_stack);
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ bind(fd, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)));
+ socklen_t addrlen = sizeof(addr);
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ getsockname(fd, reinterpret_cast<struct sockaddr*>(&addr), &addrlen));
+ return addr;
+}
+
+template <typename T>
+PosixErrorOr<T> TCPBindAndListen(int fd, bool dual_stack) {
+ ASSIGN_OR_RETURN_ERRNO(T addr, BindIP<T>(fd, dual_stack));
+ RETURN_ERROR_IF_SYSCALL_FAIL(listen(fd, /* backlog = */ 5));
+ return addr;
+}
+
+template <typename T>
+PosixErrorOr<std::unique_ptr<AddrFDSocketPair>>
+CreateTCPConnectAcceptSocketPair(int bound, int connected, int type,
+ bool dual_stack, T bind_addr) {
+ int connect_result = 0;
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ (connect_result = RetryEINTR(connect)(
+ connected, reinterpret_cast<struct sockaddr*>(&bind_addr),
+ sizeof(bind_addr))) == -1 &&
+ errno == EINPROGRESS
+ ? 0
+ : connect_result);
+ MaybeSave(); // Successful connect.
+
+ if (connect_result == -1) {
+ struct pollfd connect_poll = {connected, POLLOUT | POLLERR | POLLHUP, 0};
+ RETURN_ERROR_IF_SYSCALL_FAIL(RetryEINTR(poll)(&connect_poll, 1, 0));
+ int error = 0;
+ socklen_t errorlen = sizeof(error);
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ getsockopt(connected, SOL_SOCKET, SO_ERROR, &error, &errorlen));
+ errno = error;
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ /* connect */ error == 0 ? 0 : -1);
+ }
+
+ int accepted = -1;
+ struct pollfd accept_poll = {bound, POLLIN, 0};
+ while (accepted == -1) {
+ RETURN_ERROR_IF_SYSCALL_FAIL(RetryEINTR(poll)(&accept_poll, 1, 0));
+
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ (accepted = RetryEINTR(accept4)(
+ bound, nullptr, nullptr, type & (SOCK_NONBLOCK | SOCK_CLOEXEC))) ==
+ -1 &&
+ errno == EAGAIN
+ ? 0
+ : accepted);
+ }
+ MaybeSave(); // Successful accept.
+
+ T extra_addr = {};
+ LocalhostAddr(&extra_addr, dual_stack);
+ return absl::make_unique<AddrFDSocketPair>(connected, accepted, bind_addr,
+ extra_addr);
+}
+
+template <typename T>
+PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateTCPAcceptBindSocketPair(
+ int bound, int connected, int type, bool dual_stack) {
+ ASSIGN_OR_RETURN_ERRNO(T bind_addr, TCPBindAndListen<T>(bound, dual_stack));
+
+ auto result = CreateTCPConnectAcceptSocketPair(bound, connected, type,
+ dual_stack, bind_addr);
+
+ // Cleanup no longer needed resources.
+ RETURN_ERROR_IF_SYSCALL_FAIL(close(bound));
+ MaybeSave(); // Successful close.
+
+ return result;
+}
+
+Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
+ int protocol,
+ bool dual_stack) {
+ return [=]() -> PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> {
+ int bound;
+ RETURN_ERROR_IF_SYSCALL_FAIL(bound = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+
+ int connected;
+ RETURN_ERROR_IF_SYSCALL_FAIL(connected = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+
+ if (domain == AF_INET) {
+ return CreateTCPAcceptBindSocketPair<sockaddr_in>(bound, connected, type,
+ dual_stack);
+ }
+ return CreateTCPAcceptBindSocketPair<sockaddr_in6>(bound, connected, type,
+ dual_stack);
+ };
+}
+
+Creator<SocketPair> TCPAcceptBindPersistentListenerSocketPairCreator(
+ int domain, int type, int protocol, bool dual_stack) {
+ // These are lazily initialized below, on the first call to the returned
+ // lambda. These values are private to each returned lambda, but shared across
+ // invocations of a specific lambda.
+ //
+ // The sharing allows pairs created with the same parameters to share a
+ // listener. This prevents future connects from failing if the connecting
+ // socket selects a port which had previously been used by a listening socket
+ // that still has some connections in TIME-WAIT.
+ //
+ // The lazy initialization is to avoid creating sockets during parameter
+ // enumeration. This is important because parameters are enumerated during the
+ // build process where networking may not be available.
+ auto listener = std::make_shared<absl::optional<int>>(absl::optional<int>());
+ auto addr4 = std::make_shared<absl::optional<sockaddr_in>>(
+ absl::optional<sockaddr_in>());
+ auto addr6 = std::make_shared<absl::optional<sockaddr_in6>>(
+ absl::optional<sockaddr_in6>());
+
+ return [=]() -> PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> {
+ int connected;
+ RETURN_ERROR_IF_SYSCALL_FAIL(connected = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+
+ // Share the listener across invocations.
+ if (!listener->has_value()) {
+ int fd = socket(domain, type, protocol);
+ if (fd < 0) {
+ return PosixError(errno, absl::StrCat("socket(", domain, ", ", type,
+ ", ", protocol, ")"));
+ }
+ listener->emplace(fd);
+ MaybeSave(); // Successful socket creation.
+ }
+
+ // Bind the listener once, but create a new connect/accept pair each
+ // time.
+ if (domain == AF_INET) {
+ if (!addr4->has_value()) {
+ addr4->emplace(
+ TCPBindAndListen<sockaddr_in>(listener->value(), dual_stack)
+ .ValueOrDie());
+ }
+ return CreateTCPConnectAcceptSocketPair(listener->value(), connected,
+ type, dual_stack, addr4->value());
+ }
+ if (!addr6->has_value()) {
+ addr6->emplace(
+ TCPBindAndListen<sockaddr_in6>(listener->value(), dual_stack)
+ .ValueOrDie());
+ }
+ return CreateTCPConnectAcceptSocketPair(listener->value(), connected, type,
+ dual_stack, addr6->value());
+ };
+}
+
+template <typename T>
+PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> CreateUDPBoundSocketPair(
+ int sock1, int sock2, int type, bool dual_stack) {
+ ASSIGN_OR_RETURN_ERRNO(T addr1, BindIP<T>(sock1, dual_stack));
+ ASSIGN_OR_RETURN_ERRNO(T addr2, BindIP<T>(sock2, dual_stack));
+
+ return absl::make_unique<AddrFDSocketPair>(sock1, sock2, addr1, addr2);
+}
+
+template <typename T>
+PosixErrorOr<std::unique_ptr<AddrFDSocketPair>>
+CreateUDPBidirectionalBindSocketPair(int sock1, int sock2, int type,
+ bool dual_stack) {
+ ASSIGN_OR_RETURN_ERRNO(
+ auto socks, CreateUDPBoundSocketPair<T>(sock1, sock2, type, dual_stack));
+
+ // Connect sock1 to sock2.
+ RETURN_ERROR_IF_SYSCALL_FAIL(connect(socks->first_fd(), socks->second_addr(),
+ socks->second_addr_size()));
+ MaybeSave(); // Successful connection.
+
+ // Connect sock2 to sock1.
+ RETURN_ERROR_IF_SYSCALL_FAIL(connect(socks->second_fd(), socks->first_addr(),
+ socks->first_addr_size()));
+ MaybeSave(); // Successful connection.
+
+ return socks;
+}
+
+Creator<SocketPair> UDPBidirectionalBindSocketPairCreator(int domain, int type,
+ int protocol,
+ bool dual_stack) {
+ return [=]() -> PosixErrorOr<std::unique_ptr<AddrFDSocketPair>> {
+ int sock1;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock1 = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+
+ int sock2;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock2 = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+
+ if (domain == AF_INET) {
+ return CreateUDPBidirectionalBindSocketPair<sockaddr_in>(
+ sock1, sock2, type, dual_stack);
+ }
+ return CreateUDPBidirectionalBindSocketPair<sockaddr_in6>(sock1, sock2,
+ type, dual_stack);
+ };
+}
+
+Creator<SocketPair> UDPUnboundSocketPairCreator(int domain, int type,
+ int protocol, bool dual_stack) {
+ return [=]() -> PosixErrorOr<std::unique_ptr<FDSocketPair>> {
+ int sock1;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock1 = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+
+ int sock2;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock2 = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+
+ return absl::make_unique<FDSocketPair>(sock1, sock2);
+ };
+}
+
+SocketPairKind Reversed(SocketPairKind const& base) {
+ auto const& creator = base.creator;
+ return SocketPairKind{
+ absl::StrCat("reversed ", base.description), base.domain, base.type,
+ base.protocol,
+ [creator]() -> PosixErrorOr<std::unique_ptr<ReversedSocketPair>> {
+ ASSIGN_OR_RETURN_ERRNO(auto creator_value, creator());
+ return absl::make_unique<ReversedSocketPair>(std::move(creator_value));
+ }};
+}
+
+Creator<FileDescriptor> UnboundSocketCreator(int domain, int type,
+ int protocol) {
+ return [=]() -> PosixErrorOr<std::unique_ptr<FileDescriptor>> {
+ int sock;
+ RETURN_ERROR_IF_SYSCALL_FAIL(sock = socket(domain, type, protocol));
+ MaybeSave(); // Successful socket creation.
+
+ return absl::make_unique<FileDescriptor>(sock);
+ };
+}
+
+std::vector<SocketPairKind> IncludeReversals(std::vector<SocketPairKind> vec) {
+ return ApplyVecToVec<SocketPairKind>(std::vector<Middleware>{NoOp, Reversed},
+ vec);
+}
+
+SocketPairKind NoOp(SocketPairKind const& base) { return base; }
+
+void TransferTest(int fd1, int fd2) {
+ char buf1[20];
+ RandomizeBuffer(buf1, sizeof(buf1));
+ ASSERT_THAT(WriteFd(fd1, buf1, sizeof(buf1)),
+ SyscallSucceedsWithValue(sizeof(buf1)));
+
+ char buf2[20];
+ ASSERT_THAT(ReadFd(fd2, buf2, sizeof(buf2)),
+ SyscallSucceedsWithValue(sizeof(buf2)));
+
+ EXPECT_EQ(0, memcmp(buf1, buf2, sizeof(buf1)));
+
+ RandomizeBuffer(buf1, sizeof(buf1));
+ ASSERT_THAT(WriteFd(fd2, buf1, sizeof(buf1)),
+ SyscallSucceedsWithValue(sizeof(buf1)));
+
+ ASSERT_THAT(ReadFd(fd1, buf2, sizeof(buf2)),
+ SyscallSucceedsWithValue(sizeof(buf2)));
+
+ EXPECT_EQ(0, memcmp(buf1, buf2, sizeof(buf1)));
+}
+
+// Initializes the given buffer with random data.
+void RandomizeBuffer(char* ptr, size_t len) {
+ uint32_t seed = time(nullptr);
+ for (size_t i = 0; i < len; ++i) {
+ ptr[i] = static_cast<char>(rand_r(&seed));
+ }
+}
+
+size_t CalculateUnixSockAddrLen(const char* sun_path) {
+ // Abstract addresses always return the full length.
+ if (sun_path[0] == 0) {
+ return sizeof(sockaddr_un);
+ }
+ // Filesystem addresses use the address length plus the 2 byte sun_family
+ // and null terminator.
+ return strlen(sun_path) + 3;
+}
+
+struct sockaddr_storage AddrFDSocketPair::to_storage(const sockaddr_un& addr) {
+ struct sockaddr_storage addr_storage = {};
+ memcpy(&addr_storage, &addr, sizeof(addr));
+ return addr_storage;
+}
+
+struct sockaddr_storage AddrFDSocketPair::to_storage(const sockaddr_in& addr) {
+ struct sockaddr_storage addr_storage = {};
+ memcpy(&addr_storage, &addr, sizeof(addr));
+ return addr_storage;
+}
+
+struct sockaddr_storage AddrFDSocketPair::to_storage(const sockaddr_in6& addr) {
+ struct sockaddr_storage addr_storage = {};
+ memcpy(&addr_storage, &addr, sizeof(addr));
+ return addr_storage;
+}
+
+SocketKind SimpleSocket(int fam, int type, int proto) {
+ return SocketKind{
+ absl::StrCat("Family ", fam, ", type ", type, ", proto ", proto), fam,
+ type, proto, SyscallSocketCreator(fam, type, proto)};
+}
+
+ssize_t SendLargeSendMsg(const std::unique_ptr<SocketPair>& sockets,
+ size_t size, bool reader) {
+ const int rfd = sockets->second_fd();
+ ScopedThread t([rfd, size, reader] {
+ if (!reader) {
+ return;
+ }
+
+ // Potentially too many syscalls in the loop.
+ const DisableSave ds;
+
+ std::vector<char> buf(size);
+ size_t total = 0;
+
+ while (total < size) {
+ int ret = read(rfd, buf.data(), buf.size());
+ if (ret == -1 && errno == EAGAIN) {
+ continue;
+ }
+ if (ret > 0) {
+ total += ret;
+ }
+
+ // Assert to return on first failure.
+ ASSERT_THAT(ret, SyscallSucceeds());
+ }
+ });
+
+ std::vector<char> buf(size);
+
+ struct iovec iov = {};
+ iov.iov_base = buf.data();
+ iov.iov_len = buf.size();
+
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ return RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0);
+}
+
+namespace internal {
+PosixErrorOr<int> TryPortAvailable(int port, AddressFamily family,
+ SocketType type, bool reuse_addr) {
+ if (port < 0) {
+ return PosixError(EINVAL, "Invalid port");
+ }
+
+ // Both Ipv6 and Dualstack are AF_INET6.
+ int sock_fam = (family == AddressFamily::kIpv4 ? AF_INET : AF_INET6);
+ int sock_type = (type == SocketType::kTcp ? SOCK_STREAM : SOCK_DGRAM);
+ ASSIGN_OR_RETURN_ERRNO(auto fd, Socket(sock_fam, sock_type, 0));
+
+ if (reuse_addr) {
+ int one = 1;
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ setsockopt(fd.get(), SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)));
+ }
+
+ // Try to bind.
+ sockaddr_storage storage = {};
+ int storage_size = 0;
+ if (family == AddressFamily::kIpv4) {
+ sockaddr_in* addr = reinterpret_cast<sockaddr_in*>(&storage);
+ storage_size = sizeof(*addr);
+ addr->sin_family = AF_INET;
+ addr->sin_port = htons(port);
+ addr->sin_addr.s_addr = htonl(INADDR_ANY);
+ } else {
+ sockaddr_in6* addr = reinterpret_cast<sockaddr_in6*>(&storage);
+ storage_size = sizeof(*addr);
+ addr->sin6_family = AF_INET6;
+ addr->sin6_port = htons(port);
+ if (family == AddressFamily::kDualStack) {
+ inet_pton(AF_INET6, "::ffff:0.0.0.0",
+ reinterpret_cast<void*>(&addr->sin6_addr.s6_addr));
+ } else {
+ addr->sin6_addr = in6addr_any;
+ }
+ }
+
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ bind(fd.get(), reinterpret_cast<sockaddr*>(&storage), storage_size));
+
+ // If the user specified 0 as the port, we will return the port that the
+ // kernel gave us, otherwise we will validate that this socket bound to the
+ // requested port.
+ sockaddr_storage bound_storage = {};
+ socklen_t bound_storage_size = sizeof(bound_storage);
+ RETURN_ERROR_IF_SYSCALL_FAIL(
+ getsockname(fd.get(), reinterpret_cast<sockaddr*>(&bound_storage),
+ &bound_storage_size));
+
+ int available_port = -1;
+ if (bound_storage.ss_family == AF_INET) {
+ sockaddr_in* addr = reinterpret_cast<sockaddr_in*>(&bound_storage);
+ available_port = ntohs(addr->sin_port);
+ } else if (bound_storage.ss_family == AF_INET6) {
+ sockaddr_in6* addr = reinterpret_cast<sockaddr_in6*>(&bound_storage);
+ available_port = ntohs(addr->sin6_port);
+ } else {
+ return PosixError(EPROTOTYPE, "Getsockname returned invalid family");
+ }
+
+ // If we requested a specific port make sure our bound port is that port.
+ if (port != 0 && available_port != port) {
+ return PosixError(EINVAL,
+ absl::StrCat("Bound port ", available_port,
+ " was not equal to requested port ", port));
+ }
+
+ // If we're trying to do a TCP socket, let's also try to listen.
+ if (type == SocketType::kTcp) {
+ RETURN_ERROR_IF_SYSCALL_FAIL(listen(fd.get(), 1));
+ }
+
+ return available_port;
+}
+} // namespace internal
+
+PosixErrorOr<int> SendMsg(int sock, msghdr* msg, char buf[], int buf_size) {
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = buf_size;
+ msg->msg_iov = &iov;
+ msg->msg_iovlen = 1;
+
+ int ret;
+ RETURN_ERROR_IF_SYSCALL_FAIL(ret = RetryEINTR(sendmsg)(sock, msg, 0));
+ return ret;
+}
+
+void RecvNoData(int sock) {
+ char data = 0;
+ struct iovec iov;
+ iov.iov_base = &data;
+ iov.iov_len = 1;
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ ASSERT_THAT(RetryEINTR(recvmsg)(sock, &msg, MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TestAddress V4Any() {
+ TestAddress t("V4Any");
+ t.addr.ss_family = AF_INET;
+ t.addr_len = sizeof(sockaddr_in);
+ reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr = htonl(INADDR_ANY);
+ return t;
+}
+
+TestAddress V4Loopback() {
+ TestAddress t("V4Loopback");
+ t.addr.ss_family = AF_INET;
+ t.addr_len = sizeof(sockaddr_in);
+ reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
+ htonl(INADDR_LOOPBACK);
+ return t;
+}
+
+TestAddress V4MappedAny() {
+ TestAddress t("V4MappedAny");
+ t.addr.ss_family = AF_INET6;
+ t.addr_len = sizeof(sockaddr_in6);
+ inet_pton(AF_INET6, "::ffff:0.0.0.0",
+ reinterpret_cast<sockaddr_in6*>(&t.addr)->sin6_addr.s6_addr);
+ return t;
+}
+
+TestAddress V4MappedLoopback() {
+ TestAddress t("V4MappedLoopback");
+ t.addr.ss_family = AF_INET6;
+ t.addr_len = sizeof(sockaddr_in6);
+ inet_pton(AF_INET6, "::ffff:127.0.0.1",
+ reinterpret_cast<sockaddr_in6*>(&t.addr)->sin6_addr.s6_addr);
+ return t;
+}
+
+TestAddress V4Multicast() {
+ TestAddress t("V4Multicast");
+ t.addr.ss_family = AF_INET;
+ t.addr_len = sizeof(sockaddr_in);
+ reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
+ inet_addr(kMulticastAddress);
+ return t;
+}
+
+TestAddress V4Broadcast() {
+ TestAddress t("V4Broadcast");
+ t.addr.ss_family = AF_INET;
+ t.addr_len = sizeof(sockaddr_in);
+ reinterpret_cast<sockaddr_in*>(&t.addr)->sin_addr.s_addr =
+ htonl(INADDR_BROADCAST);
+ return t;
+}
+
+TestAddress V6Any() {
+ TestAddress t("V6Any");
+ t.addr.ss_family = AF_INET6;
+ t.addr_len = sizeof(sockaddr_in6);
+ reinterpret_cast<sockaddr_in6*>(&t.addr)->sin6_addr = in6addr_any;
+ return t;
+}
+
+TestAddress V6Loopback() {
+ TestAddress t("V6Loopback");
+ t.addr.ss_family = AF_INET6;
+ t.addr_len = sizeof(sockaddr_in6);
+ reinterpret_cast<sockaddr_in6*>(&t.addr)->sin6_addr = in6addr_loopback;
+ return t;
+}
+
+// Checksum computes the internet checksum of a buffer.
+uint16_t Checksum(uint16_t* buf, ssize_t buf_size) {
+ // Add up the 16-bit values in the buffer.
+ uint32_t total = 0;
+ for (unsigned int i = 0; i < buf_size; i += sizeof(*buf)) {
+ total += *buf;
+ buf++;
+ }
+
+ // If buf has an odd size, add the remaining byte.
+ if (buf_size % 2) {
+ total += *(reinterpret_cast<unsigned char*>(buf) - 1);
+ }
+
+ // This carries any bits past the lower 16 until everything fits in 16 bits.
+ while (total >> 16) {
+ uint16_t lower = total & 0xffff;
+ uint16_t upper = total >> 16;
+ total = lower + upper;
+ }
+
+ return ~total;
+}
+
+uint16_t IPChecksum(struct iphdr ip) {
+ return Checksum(reinterpret_cast<uint16_t*>(&ip), sizeof(ip));
+}
+
+// The pseudo-header defined in RFC 768 for calculating the UDP checksum.
+struct udp_pseudo_hdr {
+ uint32_t srcip;
+ uint32_t destip;
+ char zero;
+ char protocol;
+ uint16_t udplen;
+};
+
+uint16_t UDPChecksum(struct iphdr iphdr, struct udphdr udphdr,
+ const char* payload, ssize_t payload_len) {
+ struct udp_pseudo_hdr phdr = {};
+ phdr.srcip = iphdr.saddr;
+ phdr.destip = iphdr.daddr;
+ phdr.zero = 0;
+ phdr.protocol = IPPROTO_UDP;
+ phdr.udplen = udphdr.len;
+
+ ssize_t buf_size = sizeof(phdr) + sizeof(udphdr) + payload_len;
+ char* buf = static_cast<char*>(malloc(buf_size));
+ memcpy(buf, &phdr, sizeof(phdr));
+ memcpy(buf + sizeof(phdr), &udphdr, sizeof(udphdr));
+ memcpy(buf + sizeof(phdr) + sizeof(udphdr), payload, payload_len);
+
+ uint16_t csum = Checksum(reinterpret_cast<uint16_t*>(buf), buf_size);
+ free(buf);
+ return csum;
+}
+
+uint16_t ICMPChecksum(struct icmphdr icmphdr, const char* payload,
+ ssize_t payload_len) {
+ ssize_t buf_size = sizeof(icmphdr) + payload_len;
+ char* buf = static_cast<char*>(malloc(buf_size));
+ memcpy(buf, &icmphdr, sizeof(icmphdr));
+ memcpy(buf + sizeof(icmphdr), payload, payload_len);
+
+ uint16_t csum = Checksum(reinterpret_cast<uint16_t*>(buf), buf_size);
+ free(buf);
+ return csum;
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_test_util.h b/test/syscalls/linux/socket_test_util.h
new file mode 100644
index 000000000..734b48b96
--- /dev/null
+++ b/test/syscalls/linux/socket_test_util.h
@@ -0,0 +1,518 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_SOCKET_TEST_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_SOCKET_TEST_UTIL_H_
+
+#include <errno.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/udp.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_format.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Wrapper for socket(2) that returns a FileDescriptor.
+inline PosixErrorOr<FileDescriptor> Socket(int family, int type, int protocol) {
+ int fd = socket(family, type, protocol);
+ MaybeSave();
+ if (fd < 0) {
+ return PosixError(
+ errno, absl::StrFormat("socket(%d, %d, %d)", family, type, protocol));
+ }
+ return FileDescriptor(fd);
+}
+
+// Wrapper for accept(2) that returns a FileDescriptor.
+inline PosixErrorOr<FileDescriptor> Accept(int sockfd, sockaddr* addr,
+ socklen_t* addrlen) {
+ int fd = RetryEINTR(accept)(sockfd, addr, addrlen);
+ MaybeSave();
+ if (fd < 0) {
+ return PosixError(
+ errno, absl::StrFormat("accept(%d, %p, %p)", sockfd, addr, addrlen));
+ }
+ return FileDescriptor(fd);
+}
+
+// Wrapper for accept4(2) that returns a FileDescriptor.
+inline PosixErrorOr<FileDescriptor> Accept4(int sockfd, sockaddr* addr,
+ socklen_t* addrlen, int flags) {
+ int fd = RetryEINTR(accept4)(sockfd, addr, addrlen, flags);
+ MaybeSave();
+ if (fd < 0) {
+ return PosixError(errno, absl::StrFormat("accept4(%d, %p, %p, %#x)", sockfd,
+ addr, addrlen, flags));
+ }
+ return FileDescriptor(fd);
+}
+
+inline ssize_t SendFd(int fd, void* buf, size_t count, int flags) {
+ return internal::ApplyFileIoSyscall(
+ [&](size_t completed) {
+ return sendto(fd, static_cast<char*>(buf) + completed,
+ count - completed, flags, nullptr, 0);
+ },
+ count);
+}
+
+PosixErrorOr<struct sockaddr_un> UniqueUnixAddr(bool abstract, int domain);
+
+// A Creator<T> is a function that attempts to create and return a new T. (This
+// is copy/pasted from cloud/gvisor/api/sandbox_util.h and is just duplicated
+// here for clarity.)
+template <typename T>
+using Creator = std::function<PosixErrorOr<std::unique_ptr<T>>()>;
+
+// A SocketPair represents a pair of socket file descriptors owned by the
+// SocketPair.
+class SocketPair {
+ public:
+ virtual ~SocketPair() = default;
+
+ virtual int first_fd() const = 0;
+ virtual int second_fd() const = 0;
+ virtual int release_first_fd() = 0;
+ virtual int release_second_fd() = 0;
+ virtual const struct sockaddr* first_addr() const = 0;
+ virtual const struct sockaddr* second_addr() const = 0;
+ virtual size_t first_addr_size() const = 0;
+ virtual size_t second_addr_size() const = 0;
+ virtual size_t first_addr_len() const = 0;
+ virtual size_t second_addr_len() const = 0;
+};
+
+// A FDSocketPair is a SocketPair that consists of only a pair of file
+// descriptors.
+class FDSocketPair : public SocketPair {
+ public:
+ FDSocketPair(int first_fd, int second_fd)
+ : first_(first_fd), second_(second_fd) {}
+ FDSocketPair(std::unique_ptr<FileDescriptor> first_fd,
+ std::unique_ptr<FileDescriptor> second_fd)
+ : first_(first_fd->release()), second_(second_fd->release()) {}
+
+ int first_fd() const override { return first_.get(); }
+ int second_fd() const override { return second_.get(); }
+ int release_first_fd() override { return first_.release(); }
+ int release_second_fd() override { return second_.release(); }
+ const struct sockaddr* first_addr() const override { return nullptr; }
+ const struct sockaddr* second_addr() const override { return nullptr; }
+ size_t first_addr_size() const override { return 0; }
+ size_t second_addr_size() const override { return 0; }
+ size_t first_addr_len() const override { return 0; }
+ size_t second_addr_len() const override { return 0; }
+
+ private:
+ FileDescriptor first_;
+ FileDescriptor second_;
+};
+
+// CalculateUnixSockAddrLen calculates the length returned by recvfrom(2) and
+// recvmsg(2) for Unix sockets.
+size_t CalculateUnixSockAddrLen(const char* sun_path);
+
+// A AddrFDSocketPair is a SocketPair that consists of a pair of file
+// descriptors in addition to a pair of socket addresses.
+class AddrFDSocketPair : public SocketPair {
+ public:
+ AddrFDSocketPair(int first_fd, int second_fd,
+ const struct sockaddr_un& first_address,
+ const struct sockaddr_un& second_address)
+ : first_(first_fd),
+ second_(second_fd),
+ first_addr_(to_storage(first_address)),
+ second_addr_(to_storage(second_address)),
+ first_len_(CalculateUnixSockAddrLen(first_address.sun_path)),
+ second_len_(CalculateUnixSockAddrLen(second_address.sun_path)),
+ first_size_(sizeof(first_address)),
+ second_size_(sizeof(second_address)) {}
+
+ AddrFDSocketPair(int first_fd, int second_fd,
+ const struct sockaddr_in& first_address,
+ const struct sockaddr_in& second_address)
+ : first_(first_fd),
+ second_(second_fd),
+ first_addr_(to_storage(first_address)),
+ second_addr_(to_storage(second_address)),
+ first_len_(sizeof(first_address)),
+ second_len_(sizeof(second_address)),
+ first_size_(sizeof(first_address)),
+ second_size_(sizeof(second_address)) {}
+
+ AddrFDSocketPair(int first_fd, int second_fd,
+ const struct sockaddr_in6& first_address,
+ const struct sockaddr_in6& second_address)
+ : first_(first_fd),
+ second_(second_fd),
+ first_addr_(to_storage(first_address)),
+ second_addr_(to_storage(second_address)),
+ first_len_(sizeof(first_address)),
+ second_len_(sizeof(second_address)),
+ first_size_(sizeof(first_address)),
+ second_size_(sizeof(second_address)) {}
+
+ int first_fd() const override { return first_.get(); }
+ int second_fd() const override { return second_.get(); }
+ int release_first_fd() override { return first_.release(); }
+ int release_second_fd() override { return second_.release(); }
+ const struct sockaddr* first_addr() const override {
+ return reinterpret_cast<const struct sockaddr*>(&first_addr_);
+ }
+ const struct sockaddr* second_addr() const override {
+ return reinterpret_cast<const struct sockaddr*>(&second_addr_);
+ }
+ size_t first_addr_size() const override { return first_size_; }
+ size_t second_addr_size() const override { return second_size_; }
+ size_t first_addr_len() const override { return first_len_; }
+ size_t second_addr_len() const override { return second_len_; }
+
+ private:
+ // to_storage coverts a sockaddr_* to a sockaddr_storage.
+ static struct sockaddr_storage to_storage(const sockaddr_un& addr);
+ static struct sockaddr_storage to_storage(const sockaddr_in& addr);
+ static struct sockaddr_storage to_storage(const sockaddr_in6& addr);
+
+ FileDescriptor first_;
+ FileDescriptor second_;
+ const struct sockaddr_storage first_addr_;
+ const struct sockaddr_storage second_addr_;
+ const size_t first_len_;
+ const size_t second_len_;
+ const size_t first_size_;
+ const size_t second_size_;
+};
+
+// SyscallSocketPairCreator returns a Creator<SocketPair> that obtains file
+// descriptors by invoking the socketpair() syscall.
+Creator<SocketPair> SyscallSocketPairCreator(int domain, int type,
+ int protocol);
+
+// SyscallSocketCreator returns a Creator<FileDescriptor> that obtains a file
+// descriptor by invoking the socket() syscall.
+Creator<FileDescriptor> SyscallSocketCreator(int domain, int type,
+ int protocol);
+
+// FilesystemBidirectionalBindSocketPairCreator returns a Creator<SocketPair>
+// that obtains file descriptors by invoking the bind() and connect() syscalls
+// on filesystem paths. Only works for DGRAM sockets.
+Creator<SocketPair> FilesystemBidirectionalBindSocketPairCreator(int domain,
+ int type,
+ int protocol);
+
+// AbstractBidirectionalBindSocketPairCreator returns a Creator<SocketPair> that
+// obtains file descriptors by invoking the bind() and connect() syscalls on
+// abstract namespace paths. Only works for DGRAM sockets.
+Creator<SocketPair> AbstractBidirectionalBindSocketPairCreator(int domain,
+ int type,
+ int protocol);
+
+// SocketpairGoferSocketPairCreator returns a Creator<SocketPair> that
+// obtains file descriptors by connect() syscalls on two sockets with socketpair
+// gofer paths.
+Creator<SocketPair> SocketpairGoferSocketPairCreator(int domain, int type,
+ int protocol);
+
+// SocketpairGoferFileSocketPairCreator returns a Creator<SocketPair> that
+// obtains file descriptors by open() syscalls on socketpair gofer paths.
+Creator<SocketPair> SocketpairGoferFileSocketPairCreator(int flags);
+
+// FilesystemAcceptBindSocketPairCreator returns a Creator<SocketPair> that
+// obtains file descriptors by invoking the accept() and bind() syscalls on
+// a filesystem path. Only works for STREAM and SEQPACKET sockets.
+Creator<SocketPair> FilesystemAcceptBindSocketPairCreator(int domain, int type,
+ int protocol);
+
+// AbstractAcceptBindSocketPairCreator returns a Creator<SocketPair> that
+// obtains file descriptors by invoking the accept() and bind() syscalls on a
+// abstract namespace path. Only works for STREAM and SEQPACKET sockets.
+Creator<SocketPair> AbstractAcceptBindSocketPairCreator(int domain, int type,
+ int protocol);
+
+// FilesystemUnboundSocketPairCreator returns a Creator<SocketPair> that obtains
+// file descriptors by invoking the socket() syscall and generates a filesystem
+// path for binding.
+Creator<SocketPair> FilesystemUnboundSocketPairCreator(int domain, int type,
+ int protocol);
+
+// AbstractUnboundSocketPairCreator returns a Creator<SocketPair> that obtains
+// file descriptors by invoking the socket() syscall and generates an abstract
+// path for binding.
+Creator<SocketPair> AbstractUnboundSocketPairCreator(int domain, int type,
+ int protocol);
+
+// TCPAcceptBindSocketPairCreator returns a Creator<SocketPair> that obtains
+// file descriptors by invoking the accept() and bind() syscalls on TCP sockets.
+Creator<SocketPair> TCPAcceptBindSocketPairCreator(int domain, int type,
+ int protocol,
+ bool dual_stack);
+
+// TCPAcceptBindPersistentListenerSocketPairCreator is like
+// TCPAcceptBindSocketPairCreator, except it uses the same listening socket to
+// create all SocketPairs.
+Creator<SocketPair> TCPAcceptBindPersistentListenerSocketPairCreator(
+ int domain, int type, int protocol, bool dual_stack);
+
+// UDPBidirectionalBindSocketPairCreator returns a Creator<SocketPair> that
+// obtains file descriptors by invoking the bind() and connect() syscalls on UDP
+// sockets.
+Creator<SocketPair> UDPBidirectionalBindSocketPairCreator(int domain, int type,
+ int protocol,
+ bool dual_stack);
+
+// UDPUnboundSocketPairCreator returns a Creator<SocketPair> that obtains file
+// descriptors by creating UDP sockets.
+Creator<SocketPair> UDPUnboundSocketPairCreator(int domain, int type,
+ int protocol, bool dual_stack);
+
+// UnboundSocketCreator returns a Creator<FileDescriptor> that obtains a file
+// descriptor by creating a socket.
+Creator<FileDescriptor> UnboundSocketCreator(int domain, int type,
+ int protocol);
+
+// A SocketPairKind couples a human-readable description of a socket pair with
+// a function that creates such a socket pair.
+struct SocketPairKind {
+ std::string description;
+ int domain;
+ int type;
+ int protocol;
+ Creator<SocketPair> creator;
+
+ // Create creates a socket pair of this kind.
+ PosixErrorOr<std::unique_ptr<SocketPair>> Create() const { return creator(); }
+};
+
+// A SocketKind couples a human-readable description of a socket with
+// a function that creates such a socket.
+struct SocketKind {
+ std::string description;
+ int domain;
+ int type;
+ int protocol;
+ Creator<FileDescriptor> creator;
+
+ // Create creates a socket pair of this kind.
+ PosixErrorOr<std::unique_ptr<FileDescriptor>> Create() const {
+ return creator();
+ }
+};
+
+// A ReversedSocketPair wraps another SocketPair but flips the first and second
+// file descriptors. ReversedSocketPair is used to test socket pairs that
+// should be symmetric.
+class ReversedSocketPair : public SocketPair {
+ public:
+ explicit ReversedSocketPair(std::unique_ptr<SocketPair> base)
+ : base_(std::move(base)) {}
+
+ int first_fd() const override { return base_->second_fd(); }
+ int second_fd() const override { return base_->first_fd(); }
+ int release_first_fd() override { return base_->release_second_fd(); }
+ int release_second_fd() override { return base_->release_first_fd(); }
+ const struct sockaddr* first_addr() const override {
+ return base_->second_addr();
+ }
+ const struct sockaddr* second_addr() const override {
+ return base_->first_addr();
+ }
+ size_t first_addr_size() const override { return base_->second_addr_size(); }
+ size_t second_addr_size() const override { return base_->first_addr_size(); }
+ size_t first_addr_len() const override { return base_->second_addr_len(); }
+ size_t second_addr_len() const override { return base_->first_addr_len(); }
+
+ private:
+ std::unique_ptr<SocketPair> base_;
+};
+
+// Reversed returns a SocketPairKind that represents SocketPairs created by
+// flipping the file descriptors provided by another SocketPair.
+SocketPairKind Reversed(SocketPairKind const& base);
+
+// IncludeReversals returns a vector<SocketPairKind> that returns all
+// SocketPairKinds in `vec` as well as all SocketPairKinds obtained by flipping
+// the file descriptors provided by the kinds in `vec`.
+std::vector<SocketPairKind> IncludeReversals(std::vector<SocketPairKind> vec);
+
+// A Middleware is a function wraps a SocketPairKind.
+using Middleware = std::function<SocketPairKind(SocketPairKind)>;
+
+// Reversed returns a SocketPairKind that represents SocketPairs created by
+// flipping the file descriptors provided by another SocketPair.
+template <typename T>
+Middleware SetSockOpt(int level, int optname, T* value) {
+ return [=](SocketPairKind const& base) {
+ auto const& creator = base.creator;
+ return SocketPairKind{
+ absl::StrCat("setsockopt(", level, ", ", optname, ", ", *value, ") ",
+ base.description),
+ base.domain, base.type, base.protocol,
+ [creator, level, optname,
+ value]() -> PosixErrorOr<std::unique_ptr<SocketPair>> {
+ ASSIGN_OR_RETURN_ERRNO(auto creator_value, creator());
+ if (creator_value->first_fd() >= 0) {
+ RETURN_ERROR_IF_SYSCALL_FAIL(setsockopt(
+ creator_value->first_fd(), level, optname, value, sizeof(T)));
+ }
+ if (creator_value->second_fd() >= 0) {
+ RETURN_ERROR_IF_SYSCALL_FAIL(setsockopt(
+ creator_value->second_fd(), level, optname, value, sizeof(T)));
+ }
+ return creator_value;
+ }};
+ };
+}
+
+constexpr int kSockOptOn = 1;
+constexpr int kSockOptOff = 0;
+
+// NoOp returns the same SocketPairKind that it is passed.
+SocketPairKind NoOp(SocketPairKind const& base);
+
+// TransferTest tests that data can be send back and fourth between two
+// specified FDs. Note that calls to this function should be wrapped in
+// ASSERT_NO_FATAL_FAILURE().
+void TransferTest(int fd1, int fd2);
+
+// Fills [buf, buf+len) with random bytes.
+void RandomizeBuffer(char* buf, size_t len);
+
+// Base test fixture for tests that operate on pairs of connected sockets.
+class SocketPairTest : public ::testing::TestWithParam<SocketPairKind> {
+ protected:
+ SocketPairTest() {
+ // gUnit uses printf, so so will we.
+ printf("Testing with %s\n", GetParam().description.c_str());
+ fflush(stdout);
+ }
+
+ PosixErrorOr<std::unique_ptr<SocketPair>> NewSocketPair() const {
+ return GetParam().Create();
+ }
+};
+
+// Base test fixture for tests that operate on simple Sockets.
+class SimpleSocketTest : public ::testing::TestWithParam<SocketKind> {
+ protected:
+ SimpleSocketTest() {
+ // gUnit uses printf, so so will we.
+ printf("Testing with %s\n", GetParam().description.c_str());
+ }
+
+ PosixErrorOr<std::unique_ptr<FileDescriptor>> NewSocket() const {
+ return GetParam().Create();
+ }
+};
+
+SocketKind SimpleSocket(int fam, int type, int proto);
+
+// Send a buffer of size 'size' to sockets->first_fd(), returning the result of
+// sendmsg.
+//
+// If reader, read from second_fd() until size bytes have been read.
+ssize_t SendLargeSendMsg(const std::unique_ptr<SocketPair>& sockets,
+ size_t size, bool reader);
+
+// Initializes the given buffer with random data.
+void RandomizeBuffer(char* ptr, size_t len);
+
+enum class AddressFamily { kIpv4 = 1, kIpv6 = 2, kDualStack = 3 };
+enum class SocketType { kUdp = 1, kTcp = 2 };
+
+// Returns a PosixError or a port that is available. If 0 is specified as the
+// port it will bind port 0 (and allow the kernel to select any free port).
+// Otherwise, it will try to bind the specified port and validate that it can be
+// used for the requested family and socket type. The final option is
+// reuse_addr. This specifies whether SO_REUSEADDR should be applied before a
+// bind(2) attempt. SO_REUSEADDR means that sockets in TIME_WAIT states or other
+// bound UDP sockets would not cause an error on bind(2). This option should be
+// set if subsequent calls to bind on the returned port will also use
+// SO_REUSEADDR.
+//
+// Note: That this test will attempt to bind the ANY address for the respective
+// protocol.
+PosixErrorOr<int> PortAvailable(int port, AddressFamily family, SocketType type,
+ bool reuse_addr);
+
+// FreeAvailablePort is used to return a port that was obtained by using
+// the PortAvailable helper with port 0.
+PosixError FreeAvailablePort(int port);
+
+// SendMsg converts a buffer to an iovec and adds it to msg before sending it.
+PosixErrorOr<int> SendMsg(int sock, msghdr* msg, char buf[], int buf_size);
+
+// RecvNoData checks that no data is receivable on sock.
+void RecvNoData(int sock);
+
+// Base test fixture for tests that apply to all kinds of pairs of connected
+// sockets.
+using AllSocketPairTest = SocketPairTest;
+
+struct TestAddress {
+ std::string description;
+ sockaddr_storage addr;
+ socklen_t addr_len;
+
+ int family() const { return addr.ss_family; }
+ explicit TestAddress(std::string description = "")
+ : description(std::move(description)), addr(), addr_len() {}
+};
+
+constexpr char kMulticastAddress[] = "224.0.2.1";
+constexpr char kBroadcastAddress[] = "255.255.255.255";
+
+TestAddress V4Any();
+TestAddress V4Broadcast();
+TestAddress V4Loopback();
+TestAddress V4MappedAny();
+TestAddress V4MappedLoopback();
+TestAddress V4Multicast();
+TestAddress V6Any();
+TestAddress V6Loopback();
+
+// Compute the internet checksum of an IP header.
+uint16_t IPChecksum(struct iphdr ip);
+
+// Compute the internet checksum of a UDP header.
+uint16_t UDPChecksum(struct iphdr iphdr, struct udphdr udphdr,
+ const char* payload, ssize_t payload_len);
+
+// Compute the internet checksum of an ICMP header.
+uint16_t ICMPChecksum(struct icmphdr icmphdr, const char* payload,
+ ssize_t payload_len);
+
+namespace internal {
+PosixErrorOr<int> TryPortAvailable(int port, AddressFamily family,
+ SocketType type, bool reuse_addr);
+} // namespace internal
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_SOCKET_TEST_UTIL_H_
diff --git a/test/syscalls/linux/socket_test_util_impl.cc b/test/syscalls/linux/socket_test_util_impl.cc
new file mode 100644
index 000000000..ef661a0e3
--- /dev/null
+++ b/test/syscalls/linux/socket_test_util_impl.cc
@@ -0,0 +1,28 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+PosixErrorOr<int> PortAvailable(int port, AddressFamily family, SocketType type,
+ bool reuse_addr) {
+ return internal::TryPortAvailable(port, family, type, reuse_addr);
+}
+
+PosixError FreeAvailablePort(int port) { return NoError(); }
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
new file mode 100644
index 000000000..591cab3fd
--- /dev/null
+++ b/test/syscalls/linux/socket_unix.cc
@@ -0,0 +1,274 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_unix.h"
+
+#include <errno.h>
+#include <net/if.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+// This file contains tests specific to Unix domain sockets. It does not contain
+// tests for UDS control messages. Those belong in socket_unix_cmsg.cc.
+//
+// This file is a generic socket test file. It must be built with another file
+// that provides the test types.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST_P(UnixSocketPairTest, InvalidGetSockOpt) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ int opt;
+ socklen_t optlen = sizeof(opt);
+ EXPECT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, -1, &opt, &optlen),
+ SyscallFailsWithErrno(ENOPROTOOPT));
+}
+
+TEST_P(UnixSocketPairTest, BindToBadName) {
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ constexpr char kBadName[] = "/some/path/that/does/not/exist";
+ sockaddr_un sockaddr;
+ sockaddr.sun_family = AF_LOCAL;
+ memcpy(sockaddr.sun_path, kBadName, sizeof(kBadName));
+
+ EXPECT_THAT(
+ bind(pair->first_fd(), reinterpret_cast<struct sockaddr*>(&sockaddr),
+ sizeof(sockaddr)),
+ SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_P(UnixSocketPairTest, BindToBadFamily) {
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ constexpr char kBadName[] = "/some/path/that/does/not/exist";
+ sockaddr_un sockaddr;
+ sockaddr.sun_family = AF_INET;
+ memcpy(sockaddr.sun_path, kBadName, sizeof(kBadName));
+
+ EXPECT_THAT(
+ bind(pair->first_fd(), reinterpret_cast<struct sockaddr*>(&sockaddr),
+ sizeof(sockaddr)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(UnixSocketPairTest, RecvmmsgTimeoutAfterRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data[10];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ char received_data[sizeof(sent_data) * 2];
+ std::vector<struct mmsghdr> msgs(2);
+ std::vector<struct iovec> iovs(msgs.size());
+ const int chunk_size = sizeof(received_data) / msgs.size();
+ for (size_t i = 0; i < msgs.size(); i++) {
+ iovs[i].iov_len = chunk_size;
+ iovs[i].iov_base = &received_data[i * chunk_size];
+ msgs[i].msg_hdr.msg_iov = &iovs[i];
+ msgs[i].msg_hdr.msg_iovlen = 1;
+ }
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ struct timespec timeout = {0, 1};
+ ASSERT_THAT(RetryEINTR(recvmmsg)(sockets->second_fd(), &msgs[0], msgs.size(),
+ 0, &timeout),
+ SyscallSucceedsWithValue(1));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ EXPECT_EQ(chunk_size, msgs[0].msg_len);
+}
+
+TEST_P(UnixSocketPairTest, TIOCINQSucceeds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ if (IsRunningOnGvisor()) {
+ // TODO(gvisor.dev/issue/273): Inherited host UDS don't support TIOCINQ.
+ // Skip the test.
+ int size = -1;
+ int ret = ioctl(sockets->first_fd(), TIOCINQ, &size);
+ SKIP_IF(ret == -1 && errno == ENOTTY);
+ }
+
+ int size = -1;
+ EXPECT_THAT(ioctl(sockets->first_fd(), TIOCINQ, &size), SyscallSucceeds());
+ EXPECT_EQ(size, 0);
+
+ const char some_data[] = "dangerzone";
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->second_fd(), &some_data, sizeof(some_data), 0),
+ SyscallSucceeds());
+ EXPECT_THAT(ioctl(sockets->first_fd(), TIOCINQ, &size), SyscallSucceeds());
+ EXPECT_EQ(size, sizeof(some_data));
+
+ // Linux only reports the first message's size, which is wrong. We test for
+ // the behavior described in the man page.
+ SKIP_IF(!IsRunningOnGvisor());
+
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->second_fd(), &some_data, sizeof(some_data), 0),
+ SyscallSucceeds());
+ EXPECT_THAT(ioctl(sockets->first_fd(), TIOCINQ, &size), SyscallSucceeds());
+ EXPECT_EQ(size, sizeof(some_data) * 2);
+}
+
+TEST_P(UnixSocketPairTest, TIOCOUTQSucceeds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ if (IsRunningOnGvisor()) {
+ // TODO(gvisor.dev/issue/273): Inherited host UDS don't support TIOCOUTQ.
+ // Skip the test.
+ int size = -1;
+ int ret = ioctl(sockets->second_fd(), TIOCOUTQ, &size);
+ SKIP_IF(ret == -1 && errno == ENOTTY);
+ }
+
+ int size = -1;
+ EXPECT_THAT(ioctl(sockets->second_fd(), TIOCOUTQ, &size), SyscallSucceeds());
+ EXPECT_EQ(size, 0);
+
+ // Linux reports bogus numbers which are related to its internal allocations.
+ // We test for the behavior described in the man page.
+ SKIP_IF(!IsRunningOnGvisor());
+
+ const char some_data[] = "dangerzone";
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->second_fd(), &some_data, sizeof(some_data), 0),
+ SyscallSucceeds());
+ EXPECT_THAT(ioctl(sockets->second_fd(), TIOCOUTQ, &size), SyscallSucceeds());
+ EXPECT_EQ(size, sizeof(some_data));
+
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->second_fd(), &some_data, sizeof(some_data), 0),
+ SyscallSucceeds());
+ EXPECT_THAT(ioctl(sockets->second_fd(), TIOCOUTQ, &size), SyscallSucceeds());
+ EXPECT_EQ(size, sizeof(some_data) * 2);
+}
+
+TEST_P(UnixSocketPairTest, NetdeviceIoctlsSucceed) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // Prepare the request.
+ struct ifreq ifr;
+ snprintf(ifr.ifr_name, IFNAMSIZ, "lo");
+
+ // Check that the ioctl either succeeds or fails with ENODEV.
+ int err = ioctl(sockets->first_fd(), SIOCGIFINDEX, &ifr);
+ if (err < 0) {
+ ASSERT_EQ(errno, ENODEV);
+ }
+}
+
+TEST_P(UnixSocketPairTest, Shutdown) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ const std::string data = "abc";
+ ASSERT_THAT(WriteFd(sockets->first_fd(), data.c_str(), data.size()),
+ SyscallSucceedsWithValue(data.size()));
+
+ ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_RDWR), SyscallSucceeds());
+ ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_RDWR), SyscallSucceeds());
+
+ // Shutting down a socket does not clear the buffer.
+ char buf[3];
+ ASSERT_THAT(ReadFd(sockets->second_fd(), buf, data.size()),
+ SyscallSucceedsWithValue(data.size()));
+ EXPECT_EQ(data, absl::string_view(buf, data.size()));
+}
+
+TEST_P(UnixSocketPairTest, ShutdownRead) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_RD), SyscallSucceeds());
+
+ // When the socket is shutdown for read, read behavior varies between
+ // different socket types. This is covered by the various ReadOneSideClosed
+ // test cases.
+
+ // ... and the peer cannot write.
+ const std::string data = "abc";
+ EXPECT_THAT(WriteFd(sockets->second_fd(), data.c_str(), data.size()),
+ SyscallFailsWithErrno(EPIPE));
+
+ // ... but the socket can still write.
+ ASSERT_THAT(WriteFd(sockets->first_fd(), data.c_str(), data.size()),
+ SyscallSucceedsWithValue(data.size()));
+
+ // ... and the peer can still read.
+ char buf[3];
+ EXPECT_THAT(ReadFd(sockets->second_fd(), buf, data.size()),
+ SyscallSucceedsWithValue(data.size()));
+ EXPECT_EQ(data, absl::string_view(buf, data.size()));
+}
+
+TEST_P(UnixSocketPairTest, ShutdownWrite) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_WR), SyscallSucceeds());
+
+ // When the socket is shutdown for write, it cannot write.
+ const std::string data = "abc";
+ EXPECT_THAT(WriteFd(sockets->first_fd(), data.c_str(), data.size()),
+ SyscallFailsWithErrno(EPIPE));
+
+ // ... and the peer read behavior varies between different socket types. This
+ // is covered by the various ReadOneSideClosed test cases.
+
+ // ... but the peer can still write.
+ char buf[3];
+ ASSERT_THAT(WriteFd(sockets->second_fd(), data.c_str(), data.size()),
+ SyscallSucceedsWithValue(data.size()));
+
+ // ... and the socket can still read.
+ EXPECT_THAT(ReadFd(sockets->first_fd(), buf, data.size()),
+ SyscallSucceedsWithValue(data.size()));
+ EXPECT_EQ(data, absl::string_view(buf, data.size()));
+}
+
+TEST_P(UnixSocketPairTest, SocketReopenFromProcfs) {
+ // TODO(gvisor.dev/issue/1624): In VFS1, we return EIO instead of ENXIO (see
+ // b/122310852). Remove this skip once VFS1 is deleted.
+ SKIP_IF(IsRunningWithVFS1());
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // Opening a socket pair via /proc/self/fd/X is a ENXIO.
+ for (const int fd : {sockets->first_fd(), sockets->second_fd()}) {
+ ASSERT_THAT(Open(absl::StrCat("/proc/self/fd/", fd), O_WRONLY),
+ PosixErrorIs(ENXIO, ::testing::_));
+ }
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix.h b/test/syscalls/linux/socket_unix.h
new file mode 100644
index 000000000..3625cc404
--- /dev/null
+++ b/test/syscalls/linux/socket_unix.h
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected unix sockets.
+using UnixSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_H_
diff --git a/test/syscalls/linux/socket_unix_abstract_nonblock.cc b/test/syscalls/linux/socket_unix_abstract_nonblock.cc
new file mode 100644
index 000000000..8bef76b67
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_abstract_nonblock.cc
@@ -0,0 +1,39 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_non_blocking.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return ApplyVec<SocketPairKind>(
+ AbstractBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
+ List<int>{SOCK_NONBLOCK}));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ NonBlockingAbstractUnixSockets, NonBlockingSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc
new file mode 100644
index 000000000..77cb8c6d6
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_blocking_local.cc
@@ -0,0 +1,45 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_blocking.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(
+ UnixDomainSocketPair,
+ std::vector<int>{SOCK_STREAM, SOCK_SEQPACKET, SOCK_DGRAM}),
+ ApplyVec<SocketPairKind>(
+ FilesystemBoundUnixDomainSocketPair,
+ std::vector<int>{SOCK_STREAM, SOCK_SEQPACKET, SOCK_DGRAM}),
+ ApplyVec<SocketPairKind>(
+ AbstractBoundUnixDomainSocketPair,
+ std::vector<int>{SOCK_STREAM, SOCK_SEQPACKET, SOCK_DGRAM}));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ NonBlockingUnixDomainSockets, BlockingSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_cmsg.cc b/test/syscalls/linux/socket_unix_cmsg.cc
new file mode 100644
index 000000000..a16899493
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_cmsg.cc
@@ -0,0 +1,1501 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_unix_cmsg.h"
+
+#include <errno.h>
+#include <net/if.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+// This file contains tests for control message in Unix domain sockets.
+//
+// This file is a generic socket test file. It must be built with another file
+// that provides the test types.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST_P(UnixSocketPairCmsgTest, BasicFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, BasicTwoFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int sent_fds[] = {pair1->second_fd(), pair2->second_fd()};
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendFDs(sockets->first_fd(), sent_fds, 2, sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int received_fds[] = {-1, -1};
+
+ ASSERT_NO_FATAL_FAILURE(RecvFDs(sockets->second_fd(), received_fds, 2,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[0], pair1->first_fd()));
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[1], pair2->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, BasicThreeFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair3 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int sent_fds[] = {pair1->second_fd(), pair2->second_fd(), pair3->second_fd()};
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendFDs(sockets->first_fd(), sent_fds, 3, sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int received_fds[] = {-1, -1, -1};
+
+ ASSERT_NO_FATAL_FAILURE(RecvFDs(sockets->second_fd(), received_fds, 3,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[0], pair1->first_fd()));
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[1], pair2->first_fd()));
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[2], pair3->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, BadFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ int sent_fd = -1;
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(sent_fd))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_len = CMSG_LEN(sizeof(sent_fd));
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), &sent_fd, sizeof(sent_fd));
+
+ struct iovec iov;
+ iov.iov_base = sent_data;
+ iov.iov_len = sizeof(sent_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
+ SyscallFailsWithErrno(EBADF));
+}
+
+TEST_P(UnixSocketPairCmsgTest, ShortCmsg) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ int sent_fd = -1;
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(sent_fd))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_len = 1;
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), &sent_fd, sizeof(sent_fd));
+
+ struct iovec iov;
+ iov.iov_base = sent_data;
+ iov.iov_len = sizeof(sent_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// BasicFDPassNoSpace starts off by sending a single FD just like BasicFDPass.
+// The difference is that when calling recvmsg, no space for FDs is provided,
+// only space for the cmsg header.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassNoSpace) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(0));
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+// BasicFDPassNoSpaceMsgCtrunc sends an FD, but does not provide any space to
+// receive it. It then verifies that the MSG_CTRUNC flag is set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassNoSpaceMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(0));
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ char received_data[sizeof(sent_data)];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+}
+
+// BasicFDPassNullControlMsgCtrunc sends an FD and sets contradictory values for
+// msg_controllen and msg_control. msg_controllen is set to the correct size to
+// accommodate the FD, but msg_control is set to NULL. In this case, msg_control
+// should override msg_controllen.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassNullControlMsgCtrunc) {
+ // FIXME(gvisor.dev/issue/207): Fix handling of NULL msg_control.
+ SKIP_IF(IsRunningOnGvisor());
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ msg.msg_controllen = CMSG_SPACE(1);
+
+ char received_data[sizeof(sent_data)];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+}
+
+// BasicFDPassNotEnoughSpaceMsgCtrunc sends an FD, but does not provide enough
+// space to receive it. It then verifies that the MSG_CTRUNC flag is set in the
+// msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassNotEnoughSpaceMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(0) + 1);
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ char received_data[sizeof(sent_data)];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+}
+
+// BasicThreeFDPassTruncationMsgCtrunc sends three FDs, but only provides enough
+// space to receive two of them. It then verifies that the MSG_CTRUNC flag is
+// set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicThreeFDPassTruncationMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair3 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int sent_fds[] = {pair1->second_fd(), pair2->second_fd(), pair3->second_fd()};
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendFDs(sockets->first_fd(), sent_fds, 3, sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(2 * sizeof(int)));
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ char received_data[sizeof(sent_data)];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(2 * sizeof(int)));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+}
+
+// BasicFDPassUnalignedRecv starts off by sending a single FD just like
+// BasicFDPass. The difference is that when calling recvmsg, the length of the
+// receive data is only aligned on a 4 byte boundry instead of the normal 8.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassUnalignedRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFDUnaligned(
+ sockets->second_fd(), &fd, received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+// BasicFDPassUnalignedRecvNoMsgTrunc sends one FD and only provides enough
+// space to receive just it. (Normally the minimum amount of space one would
+// provide would be enough space for two FDs.) It then verifies that the
+// MSG_CTRUNC flag is not set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassUnalignedRecvNoMsgTrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(int)) - sizeof(int)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_flags, 0);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+}
+
+// BasicTwoFDPassUnalignedRecvTruncationMsgTrunc sends two FDs, but only
+// provides enough space to receive one of them. It then verifies that the
+// MSG_CTRUNC flag is set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicTwoFDPassUnalignedRecvTruncationMsgTrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int sent_fds[] = {pair->first_fd(), pair->second_fd()};
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendFDs(sockets->first_fd(), sent_fds, 2, sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ // CMSG_SPACE rounds up to two FDs, we only want one.
+ char control[CMSG_SPACE(sizeof(int)) - sizeof(int)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+}
+
+TEST_P(UnixSocketPairCmsgTest, ConcurrentBasicFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ int sockfd1 = sockets->first_fd();
+ auto recv_func = [sockfd1, sent_data]() {
+ char received_data[20];
+ int fd = -1;
+ RecvSingleFD(sockfd1, &fd, received_data, sizeof(received_data));
+ ASSERT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+ char buf[20];
+ ASSERT_THAT(ReadFd(fd, buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ ASSERT_THAT(WriteFd(fd, buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ };
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->second_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ ScopedThread t(recv_func);
+
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(WriteFd(pair->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[20];
+ ASSERT_THAT(ReadFd(pair->first_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ t.Join();
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+// FDPassNoRecv checks that the control message can be safely ignored by using
+// read(2) instead of recvmsg(2).
+TEST_P(UnixSocketPairCmsgTest, FDPassNoRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ // Read while ignoring the passed FD.
+ char received_data[20];
+ ASSERT_THAT(
+ ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // Check that the socket still works for reads and writes.
+ ASSERT_NO_FATAL_FAILURE(
+ TransferTest(sockets->first_fd(), sockets->second_fd()));
+}
+
+// FDPassInterspersed1 checks that sent control messages cannot be read before
+// their associated data has been read.
+TEST_P(UnixSocketPairCmsgTest, FDPassInterspersed1) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char written_data[20];
+ RandomizeBuffer(written_data, sizeof(written_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), written_data, sizeof(written_data)),
+ SyscallSucceedsWithValue(sizeof(written_data)));
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ // Check that we don't get a control message, but do get the data.
+ char received_data[20];
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data));
+ EXPECT_EQ(0, memcmp(written_data, received_data, sizeof(written_data)));
+}
+
+// FDPassInterspersed2 checks that sent control messages cannot be read after
+// their associated data has been read while ignoring the control message by
+// using read(2) instead of recvmsg(2).
+TEST_P(UnixSocketPairCmsgTest, FDPassInterspersed2) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char written_data[20];
+ RandomizeBuffer(written_data, sizeof(written_data));
+ ASSERT_THAT(WriteFd(sockets->first_fd(), written_data, sizeof(written_data)),
+ SyscallSucceedsWithValue(sizeof(written_data)));
+
+ char received_data[20];
+ ASSERT_THAT(
+ ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(written_data, received_data, sizeof(written_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassNotCoalesced) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair1->second_fd(),
+ sent_data1, sizeof(sent_data1)));
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair2->second_fd(),
+ sent_data2, sizeof(sent_data2)));
+
+ char received_data1[sizeof(sent_data1) + sizeof(sent_data2)];
+ int received_fd1 = -1;
+
+ RecvSingleFD(sockets->second_fd(), &received_fd1, received_data1,
+ sizeof(received_data1), sizeof(sent_data1));
+
+ EXPECT_EQ(0, memcmp(sent_data1, received_data1, sizeof(sent_data1)));
+ TransferTest(pair1->first_fd(), pair1->second_fd());
+
+ char received_data2[sizeof(sent_data1) + sizeof(sent_data2)];
+ int received_fd2 = -1;
+
+ RecvSingleFD(sockets->second_fd(), &received_fd2, received_data2,
+ sizeof(received_data2), sizeof(sent_data2));
+
+ EXPECT_EQ(0, memcmp(sent_data2, received_data2, sizeof(sent_data2)));
+ TransferTest(pair2->first_fd(), pair2->second_fd());
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassPeek) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char peek_data[20];
+ int peek_fd = -1;
+ PeekSingleFD(sockets->second_fd(), &peek_fd, peek_data, sizeof(peek_data));
+ EXPECT_EQ(0, memcmp(sent_data, peek_data, sizeof(sent_data)));
+ TransferTest(peek_fd, pair->first_fd());
+ EXPECT_THAT(close(peek_fd), SyscallSucceeds());
+
+ char received_data[20];
+ int received_fd = -1;
+ RecvSingleFD(sockets->second_fd(), &received_fd, received_data,
+ sizeof(received_data));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+ TransferTest(received_fd, pair->first_fd());
+ EXPECT_THAT(close(received_fd), SyscallSucceeds());
+}
+
+TEST_P(UnixSocketPairCmsgTest, BasicCredPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+ EXPECT_EQ(sent_creds.pid, received_creds.pid);
+ EXPECT_EQ(sent_creds.uid, received_creds.uid);
+ EXPECT_EQ(sent_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, SendNullCredsBeforeSoPassCredRecvEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, SendNullCredsAfterSoPassCredRecvEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ SetSoPassCred(sockets->second_fd());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, SendNullCredsBeforeSoPassCredSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->first_fd());
+
+ char received_data[20];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, SendNullCredsAfterSoPassCredSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ SetSoPassCred(sockets->first_fd());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest,
+ SendNullCredsBeforeSoPassCredRecvEndAfterSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ SetSoPassCred(sockets->first_fd());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteBeforeSoPassCredRecvEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteAfterSoPassCredRecvEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ SetSoPassCred(sockets->second_fd());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[20];
+
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteBeforeSoPassCredSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ SetSoPassCred(sockets->first_fd());
+
+ char received_data[20];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteAfterSoPassCredSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ SetSoPassCred(sockets->first_fd());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[20];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteBeforeSoPassCredRecvEndAfterSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ SetSoPassCred(sockets->first_fd());
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, CredPassTruncated) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(0) + sizeof(pid_t)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+
+ pid_t pid = 0;
+ memcpy(&pid, CMSG_DATA(cmsg), sizeof(pid));
+ EXPECT_EQ(pid, sent_creds.pid);
+}
+
+// CredPassNoMsgCtrunc passes a full set of credentials. It then verifies that
+// receiving the full set does not result in MSG_CTRUNC being set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, CredPassNoMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(struct ucred))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // The control message should not be truncated.
+ EXPECT_EQ(msg.msg_flags, 0);
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct ucred)));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+}
+
+// CredPassNoSpaceMsgCtrunc passes a full set of credentials. It then receives
+// the data without providing space for any credentials and verifies that
+// MSG_CTRUNC is set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, CredPassNoSpaceMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(0)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // The control message should be truncated.
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+}
+
+// CredPassTruncatedMsgCtrunc passes a full set of credentials. It then receives
+// the data while providing enough space for only the first field of the
+// credentials and verifies that MSG_CTRUNC is set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, CredPassTruncatedMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(0) + sizeof(pid_t)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // The control message should be truncated.
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+}
+
+TEST_P(UnixSocketPairCmsgTest, SoPassCred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int opt;
+ socklen_t optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+
+ SetSoPassCred(sockets->first_fd());
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_TRUE(opt);
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+
+ int zero = 0;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &zero,
+ sizeof(zero)),
+ SyscallSucceeds());
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+}
+
+TEST_P(UnixSocketPairCmsgTest, NoDataCredPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct msghdr msg = {};
+
+ struct iovec iov;
+ iov.iov_base = sent_data;
+ iov.iov_len = sizeof(sent_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ char control[CMSG_SPACE(0)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_CREDENTIALS;
+ cmsg->cmsg_len = CMSG_LEN(0);
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(UnixSocketPairCmsgTest, NoPassCred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, CredAndFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendCredsAndFD(sockets->first_fd(), sent_creds,
+ pair->second_fd(), sent_data,
+ sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
+ &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ EXPECT_EQ(sent_creds.pid, received_creds.pid);
+ EXPECT_EQ(sent_creds.uid, received_creds.uid);
+ EXPECT_EQ(sent_creds.gid, received_creds.gid);
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassBeforeSoPassCred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
+ &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassAfterSoPassCred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ SetSoPassCred(sockets->second_fd());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ struct ucred received_creds;
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
+ &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, CloexecDroppedWhenFDPassed) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair = ASSERT_NO_ERRNO_AND_VALUE(
+ UnixDomainSocketPair(SOCK_SEQPACKET | SOCK_CLOEXEC).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_THAT(fcntl(fd, F_GETFD), SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UnixSocketPairCmsgTest, CloexecRecvFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(int))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct iovec iov;
+ char received_data[20];
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CMSG_CLOEXEC),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+ int fd = -1;
+ memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
+
+ EXPECT_THAT(fcntl(fd, F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassAfterSoPassCredWithoutCredSpace) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ SetSoPassCred(sockets->second_fd());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ char control[CMSG_LEN(0)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[20];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+}
+
+// This test will validate that MSG_CTRUNC as an input flag to recvmsg will
+// not appear as an output flag on the control message when truncation doesn't
+// happen.
+TEST_P(UnixSocketPairCmsgTest, MsgCtruncInputIsNoop) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(int)) /* we're passing a single fd */];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct iovec iov;
+ char received_data[20];
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CTRUNC),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+ // Now we should verify that MSG_CTRUNC wasn't set as an output flag.
+ EXPECT_EQ(msg.msg_flags & MSG_CTRUNC, 0);
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassAfterSoPassCredWithoutCredHeaderSpace) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ SetSoPassCred(sockets->second_fd());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ char control[CMSG_LEN(0) / 2];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[20];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+ EXPECT_EQ(msg.msg_controllen, 0);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_cmsg.h b/test/syscalls/linux/socket_unix_cmsg.h
new file mode 100644
index 000000000..431606903
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_cmsg.h
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_CMSG_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_CMSG_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected unix sockets about
+// control messages.
+using UnixSocketPairCmsgTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_CMSG_H_
diff --git a/test/syscalls/linux/socket_unix_dgram.cc b/test/syscalls/linux/socket_unix_dgram.cc
new file mode 100644
index 000000000..af0df4fb4
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_dgram.cc
@@ -0,0 +1,45 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_unix_dgram.h"
+
+#include <stdio.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST_P(DgramUnixSocketPairTest, WriteOneSideClosed) {
+ // FIXME(b/35925052): gVisor datagram sockets return EPIPE instead of
+ // ECONNREFUSED.
+ SKIP_IF(IsRunningOnGvisor());
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+ constexpr char kStr[] = "abc";
+ ASSERT_THAT(write(sockets->second_fd(), kStr, 3),
+ SyscallFailsWithErrno(ECONNREFUSED));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_dgram.h b/test/syscalls/linux/socket_unix_dgram.h
new file mode 100644
index 000000000..0764ef85b
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_dgram.h
@@ -0,0 +1,29 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_DGRAM_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_DGRAM_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected dgram unix sockets.
+using DgramUnixSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_DGRAM_H_
diff --git a/test/syscalls/linux/socket_unix_dgram_local.cc b/test/syscalls/linux/socket_unix_dgram_local.cc
new file mode 100644
index 000000000..31d2d5216
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_dgram_local.cc
@@ -0,0 +1,58 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_non_stream.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/socket_unix_dgram.h"
+#include "test/syscalls/linux/socket_unix_non_stream.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return VecCat<SocketPairKind>(VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(
+ UnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_DGRAM, SOCK_RAW},
+ List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ FilesystemBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_DGRAM, SOCK_RAW},
+ List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ AbstractBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_DGRAM, SOCK_RAW},
+ List<int>{0, SOCK_NONBLOCK}))));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ DgramUnixSockets, DgramUnixSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ DgramUnixSockets, UnixNonStreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ DgramUnixSockets, NonStreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
new file mode 100644
index 000000000..2db8b68d3
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
@@ -0,0 +1,57 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Test fixture for tests that apply to pairs of connected non-blocking dgram
+// unix sockets.
+using NonBlockingDgramUnixSocketPairTest = SocketPairTest;
+
+TEST_P(NonBlockingDgramUnixSocketPairTest, ReadOneSideClosed) {
+ if (IsRunningOnGvisor()) {
+ // FIXME(b/70803293): gVisor datagram sockets return 0 instead of
+ // EAGAIN.
+ return;
+ }
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+ char data[10] = {};
+ ASSERT_THAT(read(sockets->second_fd(), data, sizeof(data)),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ NonBlockingDgramUnixSockets, NonBlockingDgramUnixSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(std::vector<SocketPairKind>{
+ UnixDomainSocketPair(SOCK_DGRAM | SOCK_NONBLOCK),
+ FilesystemBoundUnixDomainSocketPair(SOCK_DGRAM | SOCK_NONBLOCK),
+ AbstractBoundUnixDomainSocketPair(SOCK_DGRAM | SOCK_NONBLOCK),
+ })));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_domain.cc b/test/syscalls/linux/socket_unix_domain.cc
new file mode 100644
index 000000000..f7dff8b4d
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_domain.cc
@@ -0,0 +1,39 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_generic.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return ApplyVec<SocketPairKind>(
+ UnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK}));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, AllSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
new file mode 100644
index 000000000..6700b4d90
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
@@ -0,0 +1,39 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_non_blocking.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return ApplyVec<SocketPairKind>(
+ FilesystemBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
+ List<int>{SOCK_NONBLOCK}));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ NonBlockingFilesystemUnixSockets, NonBlockingSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_non_stream.cc b/test/syscalls/linux/socket_unix_non_stream.cc
new file mode 100644
index 000000000..884319e1d
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_non_stream.cc
@@ -0,0 +1,256 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_unix_non_stream.h"
+
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/memory_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(UnixNonStreamSocketPairTest, RecvMsgTooLarge) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int rcvbuf;
+ socklen_t length = sizeof(rcvbuf);
+ ASSERT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_RCVBUF, &rcvbuf, &length),
+ SyscallSucceeds());
+
+ // Make the call larger than the receive buffer.
+ const int recv_size = 3 * rcvbuf;
+
+ // Write a message that does fit in the receive buffer.
+ const int write_size = rcvbuf - kPageSize;
+
+ std::vector<char> write_buf(write_size, 'a');
+ const int ret = RetryEINTR(write)(sockets->second_fd(), write_buf.data(),
+ write_buf.size());
+ if (ret < 0 && errno == ENOBUFS) {
+ // NOTE(b/116636318): Linux may stall the write for a long time and
+ // ultimately return ENOBUFS. Allow this error, since a retry will likely
+ // result in the same error.
+ return;
+ }
+ ASSERT_THAT(ret, SyscallSucceeds());
+
+ std::vector<char> recv_buf(recv_size);
+
+ ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(sockets->first_fd(), recv_buf.data(),
+ recv_buf.size(), write_size));
+
+ recv_buf.resize(write_size);
+ EXPECT_EQ(recv_buf, write_buf);
+}
+
+// Create a region of anonymous memory of size 'size', which is fragmented in
+// FileMem.
+//
+// ptr contains the start address of the region. The returned vector contains
+// all of the mappings to be unmapped when done.
+PosixErrorOr<std::vector<Mapping>> CreateFragmentedRegion(const int size,
+ void** ptr) {
+ Mapping region;
+ ASSIGN_OR_RETURN_ERRNO(region, Mmap(nullptr, size, PROT_NONE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
+
+ *ptr = region.ptr();
+
+ // Don't save hundreds of times for all of these mmaps.
+ DisableSave ds;
+
+ std::vector<Mapping> pages;
+
+ // Map and commit a single page at a time, mapping and committing an unrelated
+ // page between each call to force FileMem fragmentation.
+ for (uintptr_t addr = region.addr(); addr < region.endaddr();
+ addr += kPageSize) {
+ Mapping page;
+ ASSIGN_OR_RETURN_ERRNO(
+ page,
+ Mmap(reinterpret_cast<void*>(addr), kPageSize, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0));
+ *reinterpret_cast<volatile char*>(page.ptr()) = 42;
+
+ pages.emplace_back(std::move(page));
+
+ // Unrelated page elsewhere.
+ ASSIGN_OR_RETURN_ERRNO(page,
+ Mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0));
+ *reinterpret_cast<volatile char*>(page.ptr()) = 42;
+
+ pages.emplace_back(std::move(page));
+ }
+
+ // The mappings above have taken ownership of the region.
+ region.release();
+
+ return std::move(pages);
+}
+
+// A contiguous iov that is heavily fragmented in FileMem can still be sent
+// successfully. See b/115833655.
+TEST_P(UnixNonStreamSocketPairTest, FragmentedSendMsg) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ const int buffer_size = UIO_MAXIOV * kPageSize;
+ // Extra page for message header overhead.
+ const int sndbuf = buffer_size + kPageSize;
+ // N.B. setsockopt(SO_SNDBUF) doubles the passed value.
+ const int set_sndbuf = sndbuf / 2;
+
+ EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF,
+ &set_sndbuf, sizeof(set_sndbuf)),
+ SyscallSucceeds());
+
+ int actual_sndbuf = 0;
+ socklen_t length = sizeof(actual_sndbuf);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF,
+ &actual_sndbuf, &length),
+ SyscallSucceeds());
+
+ if (actual_sndbuf != sndbuf) {
+ // Unable to get the sndbuf we want.
+ //
+ // N.B. At minimum, the socketpair gofer should provide a socket that is
+ // already the correct size.
+ //
+ // TODO(b/35921550): When internal UDS support SO_SNDBUF, we can assert that
+ // we always get the right SO_SNDBUF on gVisor.
+ GTEST_SKIP() << "SO_SNDBUF = " << actual_sndbuf << ", want " << sndbuf;
+ }
+
+ // Create a contiguous region of memory of 2*UIO_MAXIOV*PAGE_SIZE. We'll call
+ // sendmsg with a single iov, but the goal is to get the sentry to split this
+ // into > UIO_MAXIOV iovs when calling the kernel.
+ void* ptr;
+ std::vector<Mapping> pages =
+ ASSERT_NO_ERRNO_AND_VALUE(CreateFragmentedRegion(buffer_size, &ptr));
+
+ struct iovec iov = {};
+ iov.iov_base = ptr;
+ iov.iov_len = buffer_size;
+
+ struct msghdr msg = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ // NOTE(b/116636318,b/115833655): Linux has poor behavior in the presence of
+ // physical memory fragmentation. As a result, this may stall for a long time
+ // and ultimately return ENOBUFS. Allow this error, since it means that we
+ // made it to the host kernel and started the sendmsg.
+ EXPECT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
+ AnyOf(SyscallSucceedsWithValue(buffer_size),
+ SyscallFailsWithErrno(ENOBUFS)));
+}
+
+// A contiguous iov that is heavily fragmented in FileMem can still be received
+// into successfully. Regression test for b/115833655.
+TEST_P(UnixNonStreamSocketPairTest, FragmentedRecvMsg) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ const int buffer_size = UIO_MAXIOV * kPageSize;
+ // Extra page for message header overhead.
+ const int sndbuf = buffer_size + kPageSize;
+ // N.B. setsockopt(SO_SNDBUF) doubles the passed value.
+ const int set_sndbuf = sndbuf / 2;
+
+ EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF,
+ &set_sndbuf, sizeof(set_sndbuf)),
+ SyscallSucceeds());
+
+ int actual_sndbuf = 0;
+ socklen_t length = sizeof(actual_sndbuf);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF,
+ &actual_sndbuf, &length),
+ SyscallSucceeds());
+
+ if (actual_sndbuf != sndbuf) {
+ // Unable to get the sndbuf we want.
+ //
+ // N.B. At minimum, the socketpair gofer should provide a socket that is
+ // already the correct size.
+ //
+ // TODO(b/35921550): When internal UDS support SO_SNDBUF, we can assert that
+ // we always get the right SO_SNDBUF on gVisor.
+ GTEST_SKIP() << "SO_SNDBUF = " << actual_sndbuf << ", want " << sndbuf;
+ }
+
+ std::vector<char> write_buf(buffer_size, 'a');
+ const int ret = RetryEINTR(write)(sockets->first_fd(), write_buf.data(),
+ write_buf.size());
+ if (ret < 0 && errno == ENOBUFS) {
+ // NOTE(b/116636318): Linux may stall the write for a long time and
+ // ultimately return ENOBUFS. Allow this error, since a retry will likely
+ // result in the same error.
+ return;
+ }
+ ASSERT_THAT(ret, SyscallSucceeds());
+
+ // Create a contiguous region of memory of 2*UIO_MAXIOV*PAGE_SIZE. We'll call
+ // sendmsg with a single iov, but the goal is to get the sentry to split this
+ // into > UIO_MAXIOV iovs when calling the kernel.
+ void* ptr;
+ std::vector<Mapping> pages =
+ ASSERT_NO_ERRNO_AND_VALUE(CreateFragmentedRegion(buffer_size, &ptr));
+
+ ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(
+ sockets->second_fd(), reinterpret_cast<char*>(ptr), buffer_size));
+
+ EXPECT_EQ(0, memcmp(write_buf.data(), ptr, buffer_size));
+}
+
+TEST_P(UnixNonStreamSocketPairTest, SendTimeout) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = 10
+ };
+ EXPECT_THAT(
+ setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)),
+ SyscallSucceeds());
+
+ const int buf_size = 5 * kPageSize;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_SNDBUF, &buf_size,
+ sizeof(buf_size)),
+ SyscallSucceeds());
+ EXPECT_THAT(setsockopt(sockets->second_fd(), SOL_SOCKET, SO_RCVBUF, &buf_size,
+ sizeof(buf_size)),
+ SyscallSucceeds());
+
+ // The buffer size should be big enough to avoid many iterations in the next
+ // loop. Otherwise, this will slow down cooperative_save tests.
+ std::vector<char> buf(kPageSize);
+ for (;;) {
+ int ret;
+ ASSERT_THAT(
+ ret = RetryEINTR(send)(sockets->first_fd(), buf.data(), buf.size(), 0),
+ ::testing::AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(EAGAIN)));
+ if (ret == -1) {
+ break;
+ }
+ }
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_non_stream.h b/test/syscalls/linux/socket_unix_non_stream.h
new file mode 100644
index 000000000..7478ab172
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_non_stream.h
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_NON_STREAM_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_NON_STREAM_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected non-stream
+// unix-domain sockets.
+using UnixNonStreamSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_NON_STREAM_H_
diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
new file mode 100644
index 000000000..fddcdf1c5
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_non_stream_blocking.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(UnixDomainSocketPair,
+ std::vector<int>{SOCK_DGRAM, SOCK_SEQPACKET}),
+ ApplyVec<SocketPairKind>(FilesystemBoundUnixDomainSocketPair,
+ std::vector<int>{SOCK_DGRAM, SOCK_SEQPACKET}),
+ ApplyVec<SocketPairKind>(AbstractBoundUnixDomainSocketPair,
+ std::vector<int>{SOCK_DGRAM, SOCK_SEQPACKET}));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ BlockingNonStreamUnixSockets, BlockingNonStreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_pair.cc b/test/syscalls/linux/socket_unix_pair.cc
new file mode 100644
index 000000000..85999db04
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_pair.cc
@@ -0,0 +1,44 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/socket_unix.h"
+#include "test/syscalls/linux/socket_unix_cmsg.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return VecCat<SocketPairKind>(ApplyVec<SocketPairKind>(
+ UnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK})));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, UnixSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, UnixSocketPairCmsgTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_pair_nonblock.cc b/test/syscalls/linux/socket_unix_pair_nonblock.cc
new file mode 100644
index 000000000..281410a9a
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_pair_nonblock.cc
@@ -0,0 +1,39 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_non_blocking.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return ApplyVec<SocketPairKind>(
+ UnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
+ List<int>{SOCK_NONBLOCK}));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ NonBlockingUnixSockets, NonBlockingSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_seqpacket.cc b/test/syscalls/linux/socket_unix_seqpacket.cc
new file mode 100644
index 000000000..6d03df4d9
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_seqpacket.cc
@@ -0,0 +1,67 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_unix_seqpacket.h"
+
+#include <stdio.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST_P(SeqpacketUnixSocketPairTest, WriteOneSideClosed) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+ constexpr char kStr[] = "abc";
+ ASSERT_THAT(write(sockets->second_fd(), kStr, 3),
+ SyscallFailsWithErrno(EPIPE));
+}
+
+TEST_P(SeqpacketUnixSocketPairTest, ReadOneSideClosed) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+ char data[10] = {};
+ ASSERT_THAT(read(sockets->second_fd(), data, sizeof(data)),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(SeqpacketUnixSocketPairTest, Sendto) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct sockaddr_un addr = {};
+ addr.sun_family = AF_UNIX;
+ constexpr char kPath[] = "\0nonexistent";
+ memcpy(addr.sun_path, kPath, sizeof(kPath));
+
+ constexpr char kStr[] = "abc";
+ ASSERT_THAT(sendto(sockets->second_fd(), kStr, 3, 0, (struct sockaddr*)&addr,
+ sizeof(addr)),
+ SyscallSucceedsWithValue(3));
+
+ char data[10] = {};
+ ASSERT_THAT(read(sockets->first_fd(), data, sizeof(data)),
+ SyscallSucceedsWithValue(3));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_seqpacket.h b/test/syscalls/linux/socket_unix_seqpacket.h
new file mode 100644
index 000000000..30d9b9edf
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_seqpacket.h
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_SEQPACKET_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_SEQPACKET_H_
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Test fixture for tests that apply to pairs of connected seqpacket unix
+// sockets.
+using SeqpacketUnixSocketPairTest = SocketPairTest;
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_SEQPACKET_H_
diff --git a/test/syscalls/linux/socket_unix_seqpacket_local.cc b/test/syscalls/linux/socket_unix_seqpacket_local.cc
new file mode 100644
index 000000000..69a5f150d
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_seqpacket_local.cc
@@ -0,0 +1,58 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_non_stream.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/socket_unix_non_stream.h"
+#include "test/syscalls/linux/socket_unix_seqpacket.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return VecCat<SocketPairKind>(VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(
+ UnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ FilesystemBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ AbstractBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK}))));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ SeqpacketUnixSockets, NonStreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ SeqpacketUnixSockets, SeqpacketUnixSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ SeqpacketUnixSockets, UnixNonStreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream.cc b/test/syscalls/linux/socket_unix_stream.cc
new file mode 100644
index 000000000..99e77b89e
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_stream.cc
@@ -0,0 +1,125 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <poll.h>
+#include <stdio.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Test fixture for tests that apply to pairs of connected stream unix sockets.
+using StreamUnixSocketPairTest = SocketPairTest;
+
+TEST_P(StreamUnixSocketPairTest, WriteOneSideClosed) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+ constexpr char kStr[] = "abc";
+ ASSERT_THAT(write(sockets->second_fd(), kStr, 3),
+ SyscallFailsWithErrno(EPIPE));
+}
+
+TEST_P(StreamUnixSocketPairTest, ReadOneSideClosed) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+ char data[10] = {};
+ ASSERT_THAT(read(sockets->second_fd(), data, sizeof(data)),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(StreamUnixSocketPairTest, RecvmsgOneSideClosed) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // Set timeout so that it will not wait for ever.
+ struct timeval tv {
+ .tv_sec = 0, .tv_usec = 10
+ };
+ EXPECT_THAT(setsockopt(sockets->second_fd(), SOL_SOCKET, SO_RCVTIMEO, &tv,
+ sizeof(tv)),
+ SyscallSucceeds());
+
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+
+ char received_data[10] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ struct msghdr msg = {};
+ msg.msg_flags = -1;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(recvmsg(sockets->second_fd(), &msg, MSG_WAITALL),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(StreamUnixSocketPairTest, ReadOneSideClosedWithUnreadData) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char buf[10] = {};
+ ASSERT_THAT(RetryEINTR(write)(sockets->second_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_RDWR), SyscallSucceeds());
+
+ ASSERT_THAT(RetryEINTR(read)(sockets->second_fd(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(0));
+
+ ASSERT_THAT(close(sockets->release_first_fd()), SyscallSucceeds());
+
+ ASSERT_THAT(RetryEINTR(read)(sockets->second_fd(), buf, sizeof(buf)),
+ SyscallFailsWithErrno(ECONNRESET));
+}
+
+TEST_P(StreamUnixSocketPairTest, Sendto) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct sockaddr_un addr = {};
+ addr.sun_family = AF_UNIX;
+ constexpr char kPath[] = "\0nonexistent";
+ memcpy(addr.sun_path, kPath, sizeof(kPath));
+
+ constexpr char kStr[] = "abc";
+ ASSERT_THAT(sendto(sockets->second_fd(), kStr, 3, 0, (struct sockaddr*)&addr,
+ sizeof(addr)),
+ SyscallFailsWithErrno(EISCONN));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, StreamUnixSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(UnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{
+ 0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(FilesystemBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{
+ 0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ AbstractBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{0, SOCK_NONBLOCK}))))));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
new file mode 100644
index 000000000..8429bd429
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
@@ -0,0 +1,40 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_stream_blocking.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return {
+ UnixDomainSocketPair(SOCK_STREAM),
+ FilesystemBoundUnixDomainSocketPair(SOCK_STREAM),
+ AbstractBoundUnixDomainSocketPair(SOCK_STREAM),
+ };
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ BlockingStreamUnixSockets, BlockingStreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream_local.cc b/test/syscalls/linux/socket_unix_stream_local.cc
new file mode 100644
index 000000000..a7e3449a9
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_stream_local.cc
@@ -0,0 +1,48 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <vector>
+
+#include "test/syscalls/linux/socket_stream.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(
+ UnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ FilesystemBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ AbstractBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{0, SOCK_NONBLOCK})));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ StreamUnixSockets, StreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
new file mode 100644
index 000000000..4b763c8e2
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
@@ -0,0 +1,39 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <vector>
+
+#include "test/syscalls/linux/socket_stream_nonblock.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+std::vector<SocketPairKind> GetSocketPairs() {
+ return {
+ UnixDomainSocketPair(SOCK_STREAM | SOCK_NONBLOCK),
+ FilesystemBoundUnixDomainSocketPair(SOCK_STREAM | SOCK_NONBLOCK),
+ AbstractBoundUnixDomainSocketPair(SOCK_STREAM | SOCK_NONBLOCK),
+ };
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ NonBlockingStreamUnixSockets, NonBlockingStreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_unbound_abstract.cc b/test/syscalls/linux/socket_unix_unbound_abstract.cc
new file mode 100644
index 000000000..8b1762000
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_unbound_abstract.cc
@@ -0,0 +1,116 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Test fixture for tests that apply to pairs of unbound abstract unix sockets.
+using UnboundAbstractUnixSocketPairTest = SocketPairTest;
+
+TEST_P(UnboundAbstractUnixSocketPairTest, AddressAfterNull) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct sockaddr_un addr =
+ *reinterpret_cast<const struct sockaddr_un*>(sockets->first_addr());
+ ASSERT_EQ(addr.sun_path[sizeof(addr.sun_path) - 1], 0);
+ SKIP_IF(addr.sun_path[sizeof(addr.sun_path) - 2] != 0 ||
+ addr.sun_path[sizeof(addr.sun_path) - 3] != 0);
+
+ addr.sun_path[sizeof(addr.sun_path) - 2] = 'a';
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(bind(sockets->second_fd(),
+ reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+ SyscallSucceeds());
+}
+
+TEST_P(UnboundAbstractUnixSocketPairTest, ShortAddressNotExtended) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct sockaddr_un addr =
+ *reinterpret_cast<const struct sockaddr_un*>(sockets->first_addr());
+ ASSERT_EQ(addr.sun_path[sizeof(addr.sun_path) - 1], 0);
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size() - 1),
+ SyscallSucceeds());
+
+ ASSERT_THAT(bind(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+}
+
+TEST_P(UnboundAbstractUnixSocketPairTest, BindNothing) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ struct sockaddr_un addr = {.sun_family = AF_UNIX};
+ ASSERT_THAT(bind(sockets->first_fd(),
+ reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+ SyscallSucceeds());
+}
+
+TEST_P(UnboundAbstractUnixSocketPairTest, GetSockNameFullLength) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ sockaddr_storage addr = {};
+ socklen_t addr_len = sizeof(addr);
+ ASSERT_THAT(getsockname(sockets->first_fd(),
+ reinterpret_cast<struct sockaddr*>(&addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, sockets->first_addr_size());
+}
+
+TEST_P(UnboundAbstractUnixSocketPairTest, GetSockNamePartialLength) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size() - 1),
+ SyscallSucceeds());
+
+ sockaddr_storage addr = {};
+ socklen_t addr_len = sizeof(addr);
+ ASSERT_THAT(getsockname(sockets->first_fd(),
+ reinterpret_cast<struct sockaddr*>(&addr), &addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(addr_len, sockets->first_addr_size() - 1);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, UnboundAbstractUnixSocketPairTest,
+ ::testing::ValuesIn(ApplyVec<SocketPairKind>(
+ AbstractUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_SEQPACKET,
+ SOCK_DGRAM},
+ List<int>{0, SOCK_NONBLOCK}))));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_unbound_dgram.cc b/test/syscalls/linux/socket_unix_unbound_dgram.cc
new file mode 100644
index 000000000..907dca0f1
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_unbound_dgram.cc
@@ -0,0 +1,183 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Test fixture for tests that apply to pairs of unbound dgram unix sockets.
+using UnboundDgramUnixSocketPairTest = SocketPairTest;
+
+TEST_P(UnboundDgramUnixSocketPairTest, BindConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+}
+
+TEST_P(UnboundDgramUnixSocketPairTest, SelfConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ ASSERT_THAT(connect(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+}
+
+TEST_P(UnboundDgramUnixSocketPairTest, DoubleConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+}
+
+TEST_P(UnboundDgramUnixSocketPairTest, GetRemoteAddress) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ socklen_t addressLength = sockets->first_addr_size();
+ struct sockaddr_storage address = {};
+ ASSERT_THAT(getpeername(sockets->second_fd(), (struct sockaddr*)(&address),
+ &addressLength),
+ SyscallSucceeds());
+ EXPECT_EQ(
+ 0, memcmp(&address, sockets->first_addr(), sockets->first_addr_size()));
+}
+
+TEST_P(UnboundDgramUnixSocketPairTest, Sendto) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(sendto(sockets->second_fd(), sent_data, sizeof(sent_data), 0,
+ sockets->first_addr(), sockets->first_addr_size()),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[sizeof(sent_data)];
+ ASSERT_THAT(ReadFd(sockets->first_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(received_data)));
+}
+
+TEST_P(UnboundDgramUnixSocketPairTest, ZeroWriteAllowed) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ char sent_data[3];
+ // Send a zero length packet.
+ ASSERT_THAT(write(sockets->second_fd(), sent_data, 0),
+ SyscallSucceedsWithValue(0));
+ // Receive the packet.
+ char received_data[sizeof(sent_data)];
+ ASSERT_THAT(read(sockets->first_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UnboundDgramUnixSocketPairTest, Listen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(listen(sockets->first_fd(), 0), SyscallFailsWithErrno(ENOTSUP));
+}
+
+TEST_P(UnboundDgramUnixSocketPairTest, Accept) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ ASSERT_THAT(accept(sockets->first_fd(), nullptr, nullptr),
+ SyscallFailsWithErrno(ENOTSUP));
+}
+
+TEST_P(UnboundDgramUnixSocketPairTest, SendtoWithoutConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ char data = 'a';
+ ASSERT_THAT(
+ RetryEINTR(sendto)(sockets->second_fd(), &data, sizeof(data), 0,
+ sockets->first_addr(), sockets->first_addr_size()),
+ SyscallSucceedsWithValue(sizeof(data)));
+}
+
+TEST_P(UnboundDgramUnixSocketPairTest, SendtoWithoutConnectPassCreds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ SetSoPassCred(sockets->first_fd());
+ char data = 'a';
+ ASSERT_THAT(
+ RetryEINTR(sendto)(sockets->second_fd(), &data, sizeof(data), 0,
+ sockets->first_addr(), sockets->first_addr_size()),
+ SyscallSucceedsWithValue(sizeof(data)));
+ ucred creds;
+ creds.pid = -1;
+ char buf[sizeof(data) + 1];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvCreds(sockets->first_fd(), &creds, buf, sizeof(buf), sizeof(data)));
+ EXPECT_EQ(0, memcmp(&data, buf, sizeof(data)));
+ EXPECT_THAT(getpid(), SyscallSucceedsWithValue(creds.pid));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, UnboundDgramUnixSocketPairTest,
+ ::testing::ValuesIn(VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(FilesystemUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_DGRAM},
+ List<int>{
+ 0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ AbstractUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_DGRAM},
+ List<int>{0, SOCK_NONBLOCK})))));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_unbound_filesystem.cc b/test/syscalls/linux/socket_unix_unbound_filesystem.cc
new file mode 100644
index 000000000..cab912152
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_unbound_filesystem.cc
@@ -0,0 +1,84 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Test fixture for tests that apply to pairs of unbound filesystem unix
+// sockets.
+using UnboundFilesystemUnixSocketPairTest = SocketPairTest;
+
+TEST_P(UnboundFilesystemUnixSocketPairTest, AddressAfterNull) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ struct sockaddr_un addr =
+ *reinterpret_cast<const struct sockaddr_un*>(sockets->first_addr());
+ ASSERT_EQ(addr.sun_path[sizeof(addr.sun_path) - 1], 0);
+ SKIP_IF(addr.sun_path[sizeof(addr.sun_path) - 2] != 0 ||
+ addr.sun_path[sizeof(addr.sun_path) - 3] != 0);
+
+ addr.sun_path[sizeof(addr.sun_path) - 2] = 'a';
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ ASSERT_THAT(bind(sockets->second_fd(),
+ reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)),
+ SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(UnboundFilesystemUnixSocketPairTest, GetSockNameLength) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ sockaddr_storage got_addr = {};
+ socklen_t got_addr_len = sizeof(got_addr);
+ ASSERT_THAT(
+ getsockname(sockets->first_fd(),
+ reinterpret_cast<struct sockaddr*>(&got_addr), &got_addr_len),
+ SyscallSucceeds());
+
+ sockaddr_un want_addr =
+ *reinterpret_cast<const struct sockaddr_un*>(sockets->first_addr());
+
+ EXPECT_EQ(got_addr_len,
+ strlen(want_addr.sun_path) + 1 + sizeof(want_addr.sun_family));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, UnboundFilesystemUnixSocketPairTest,
+ ::testing::ValuesIn(ApplyVec<SocketPairKind>(
+ FilesystemUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_SEQPACKET,
+ SOCK_DGRAM},
+ List<int>{0, SOCK_NONBLOCK}))));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_unbound_seqpacket.cc b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
new file mode 100644
index 000000000..cb99030f5
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_unbound_seqpacket.cc
@@ -0,0 +1,89 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Test fixture for tests that apply to pairs of unbound seqpacket unix sockets.
+using UnboundUnixSeqpacketSocketPairTest = SocketPairTest;
+
+TEST_P(UnboundUnixSeqpacketSocketPairTest, SendtoWithoutConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ char data = 'a';
+ ASSERT_THAT(sendto(sockets->second_fd(), &data, sizeof(data), 0,
+ sockets->first_addr(), sockets->first_addr_size()),
+ SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UnboundUnixSeqpacketSocketPairTest, SendtoWithoutConnectIgnoresAddr) {
+ // FIXME(b/68223466): gVisor tries to find /foo/bar and thus returns ENOENT.
+ if (IsRunningOnGvisor()) {
+ return;
+ }
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ // Even a bogus address is completely ignored.
+ constexpr char kPath[] = "/foo/bar";
+
+ // Sanity check that kPath doesn't exist.
+ struct stat s;
+ ASSERT_THAT(stat(kPath, &s), SyscallFailsWithErrno(ENOENT));
+
+ struct sockaddr_un addr = {};
+ addr.sun_family = AF_UNIX;
+ memcpy(addr.sun_path, kPath, sizeof(kPath));
+
+ char data = 'a';
+ ASSERT_THAT(
+ sendto(sockets->second_fd(), &data, sizeof(data), 0,
+ reinterpret_cast<const struct sockaddr*>(&addr), sizeof(addr)),
+ SyscallFailsWithErrno(ENOTCONN));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, UnboundUnixSeqpacketSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(
+ FilesystemUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ AbstractUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_SEQPACKET},
+ List<int>{0, SOCK_NONBLOCK}))))));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_unbound_stream.cc b/test/syscalls/linux/socket_unix_unbound_stream.cc
new file mode 100644
index 000000000..f185dded3
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_unbound_stream.cc
@@ -0,0 +1,733 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <sys/un.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Test fixture for tests that apply to pairs of connected unix stream sockets.
+using UnixStreamSocketPairTest = SocketPairTest;
+
+// FDPassPartialRead checks that sent control messages cannot be read after
+// any of their associated data has been read while ignoring the control message
+// by using read(2) instead of recvmsg(2).
+TEST_P(UnixStreamSocketPairTest, FDPassPartialRead) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[sizeof(sent_data) / 2];
+ ASSERT_THAT(
+ ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(received_data)));
+
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data));
+ EXPECT_EQ(0, memcmp(sent_data + sizeof(received_data), received_data,
+ sizeof(received_data)));
+}
+
+TEST_P(UnixStreamSocketPairTest, FDPassCoalescedRead) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair1->second_fd(),
+ sent_data1, sizeof(sent_data1)));
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair2->second_fd(),
+ sent_data2, sizeof(sent_data2)));
+
+ char received_data[sizeof(sent_data1) + sizeof(sent_data2)];
+ ASSERT_THAT(
+ ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1)));
+ EXPECT_EQ(0, memcmp(sent_data2, received_data + sizeof(sent_data1),
+ sizeof(sent_data2)));
+}
+
+// ZeroLengthMessageFDDiscarded checks that control messages associated with
+// zero length messages are discarded.
+TEST_P(UnixStreamSocketPairTest, ZeroLengthMessageFDDiscarded) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ // Zero length arrays are invalid in ISO C++, so allocate one of size 1 and
+ // send a length of 0.
+ char sent_data1[1] = {};
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendSingleFD(sockets->first_fd(), pair->second_fd(), sent_data1, 0));
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+
+ char received_data[sizeof(sent_data2)] = {};
+
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data));
+ EXPECT_EQ(0, memcmp(sent_data2, received_data, sizeof(received_data)));
+}
+
+// FDPassCoalescedRecv checks that control messages not in the first message are
+// preserved in a coalesced recv.
+TEST_P(UnixStreamSocketPairTest, FDPassCoalescedRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data) / 2),
+ SyscallSucceedsWithValue(sizeof(sent_data) / 2));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data + sizeof(sent_data) / 2,
+ sizeof(sent_data) / 2));
+
+ char received_data[sizeof(sent_data)];
+
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+// ReadsNotCoalescedAfterFDPass checks that messages after a message containing
+// an FD control message are not coalesced.
+TEST_P(UnixStreamSocketPairTest, ReadsNotCoalescedAfterFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data) / 2));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data + sizeof(sent_data) / 2,
+ sizeof(sent_data) / 2),
+ SyscallSucceedsWithValue(sizeof(sent_data) / 2));
+
+ char received_data[sizeof(sent_data)];
+
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
+ sizeof(received_data),
+ sizeof(sent_data) / 2));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(sent_data) / 2));
+
+ EXPECT_EQ(0, memcmp(sent_data + sizeof(sent_data) / 2, received_data,
+ sizeof(sent_data) / 2));
+}
+
+// FDPassNotCombined checks that FD control messages are not combined in a
+// coalesced read.
+TEST_P(UnixStreamSocketPairTest, FDPassNotCombined) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair1->second_fd(),
+ sent_data, sizeof(sent_data) / 2));
+
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair2->second_fd(),
+ sent_data + sizeof(sent_data) / 2,
+ sizeof(sent_data) / 2));
+
+ char received_data[sizeof(sent_data)];
+
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
+ sizeof(received_data),
+ sizeof(sent_data) / 2));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair1->first_fd()));
+
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ fd = -1;
+
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
+ sizeof(received_data),
+ sizeof(sent_data) / 2));
+
+ EXPECT_EQ(0, memcmp(sent_data + sizeof(sent_data) / 2, received_data,
+ sizeof(sent_data) / 2));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair2->first_fd()));
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST_P(UnixStreamSocketPairTest, CredPassPartialRead) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ int one = 1;
+ ASSERT_THAT(setsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &one,
+ sizeof(one)),
+ SyscallSucceeds());
+
+ for (int i = 0; i < 2; i++) {
+ char received_data[10];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data),
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data + i * sizeof(received_data), received_data,
+ sizeof(received_data)));
+ EXPECT_EQ(sent_creds.pid, received_creds.pid);
+ EXPECT_EQ(sent_creds.uid, received_creds.uid);
+ EXPECT_EQ(sent_creds.gid, received_creds.gid);
+ }
+}
+
+// Unix stream sockets peek in the same way as datagram sockets.
+//
+// SinglePeek checks that only a single message is peekable in a single recv.
+TEST_P(UnixStreamSocketPairTest, SinglePeek) {
+ if (!IsRunningOnGvisor()) {
+ // Don't run this test on linux kernels newer than 4.3.x Linux kernel commit
+ // 9f389e35674f5b086edd70ed524ca0f287259725 which changes this behavior. We
+ // used to target 3.11 compatibility, so disable this test on newer kernels.
+ //
+ // NOTE(b/118902768): Bring this up to Linux 4.4 compatibility.
+ auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion());
+ SKIP_IF(version.major > 4 || (version.major == 4 && version.minor >= 3));
+ }
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char sent_data[40];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(RetryEINTR(send)(sockets->first_fd(), sent_data,
+ sizeof(sent_data) / 2, 0),
+ SyscallSucceedsWithValue(sizeof(sent_data) / 2));
+ ASSERT_THAT(
+ RetryEINTR(send)(sockets->first_fd(), sent_data + sizeof(sent_data) / 2,
+ sizeof(sent_data) / 2, 0),
+ SyscallSucceedsWithValue(sizeof(sent_data) / 2));
+ char received_data[sizeof(sent_data)];
+ for (int i = 0; i < 3; i++) {
+ memset(received_data, 0, sizeof(received_data));
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(received_data), MSG_PEEK),
+ SyscallSucceedsWithValue(sizeof(sent_data) / 2));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2));
+ }
+ memset(received_data, 0, sizeof(received_data));
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(sent_data) / 2, 0),
+ SyscallSucceedsWithValue(sizeof(sent_data) / 2));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2));
+ memset(received_data, 0, sizeof(received_data));
+ ASSERT_THAT(RetryEINTR(recv)(sockets->second_fd(), received_data,
+ sizeof(sent_data) / 2, 0),
+ SyscallSucceedsWithValue(sizeof(sent_data) / 2));
+ EXPECT_EQ(0, memcmp(sent_data + sizeof(sent_data) / 2, received_data,
+ sizeof(sent_data) / 2));
+}
+
+TEST_P(UnixStreamSocketPairTest, CredsNotCoalescedUp) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+
+ char received_data[sizeof(sent_data1) + sizeof(sent_data2)];
+
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data),
+ sizeof(sent_data1)));
+
+ EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1)));
+
+ struct ucred want_creds {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data),
+ sizeof(sent_data2)));
+
+ EXPECT_EQ(0, memcmp(sent_data2, received_data, sizeof(sent_data2)));
+
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixStreamSocketPairTest, CredsNotCoalescedDown) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ SetSoPassCred(sockets->second_fd());
+
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+
+ UnsetSoPassCred(sockets->second_fd());
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[sizeof(sent_data1) + sizeof(sent_data2)];
+ struct ucred received_creds;
+
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data),
+ sizeof(sent_data1)));
+
+ EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data),
+ sizeof(sent_data2)));
+
+ EXPECT_EQ(0, memcmp(sent_data2, received_data, sizeof(sent_data2)));
+
+ want_creds = {0, 65534, 65534};
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixStreamSocketPairTest, CoalescedCredsNoPasscred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ SetSoPassCred(sockets->second_fd());
+
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+
+ UnsetSoPassCred(sockets->second_fd());
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+
+ char received_data[sizeof(sent_data1) + sizeof(sent_data2)];
+
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1)));
+ EXPECT_EQ(0, memcmp(sent_data2, received_data + sizeof(sent_data1),
+ sizeof(sent_data2)));
+}
+
+TEST_P(UnixStreamSocketPairTest, CoalescedCreds1) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[sizeof(sent_data1) + sizeof(sent_data2)];
+ struct ucred received_creds;
+
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1)));
+ EXPECT_EQ(0, memcmp(sent_data2, received_data + sizeof(sent_data1),
+ sizeof(sent_data2)));
+
+ struct ucred want_creds {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixStreamSocketPairTest, CoalescedCreds2) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ SetSoPassCred(sockets->second_fd());
+
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+
+ char received_data[sizeof(sent_data1) + sizeof(sent_data2)];
+ struct ucred received_creds;
+
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1)));
+ EXPECT_EQ(0, memcmp(sent_data2, received_data + sizeof(sent_data1),
+ sizeof(sent_data2)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixStreamSocketPairTest, NonCoalescedDifferingCreds1) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+
+ char received_data1[sizeof(sent_data1) + sizeof(sent_data2)];
+ struct ucred received_creds1;
+
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds1,
+ received_data1, sizeof(sent_data1)));
+
+ EXPECT_EQ(0, memcmp(sent_data1, received_data1, sizeof(sent_data1)));
+
+ struct ucred want_creds1 {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds1.pid, received_creds1.pid);
+ EXPECT_EQ(want_creds1.uid, received_creds1.uid);
+ EXPECT_EQ(want_creds1.gid, received_creds1.gid);
+
+ char received_data2[sizeof(sent_data1) + sizeof(sent_data2)];
+ struct ucred received_creds2;
+
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds2,
+ received_data2, sizeof(sent_data2)));
+
+ EXPECT_EQ(0, memcmp(sent_data2, received_data2, sizeof(sent_data2)));
+
+ struct ucred want_creds2;
+ ASSERT_THAT(want_creds2.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds2.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds2.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds2.pid, received_creds2.pid);
+ EXPECT_EQ(want_creds2.uid, received_creds2.uid);
+ EXPECT_EQ(want_creds2.gid, received_creds2.gid);
+}
+
+TEST_P(UnixStreamSocketPairTest, NonCoalescedDifferingCreds2) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ SetSoPassCred(sockets->second_fd());
+
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+
+ UnsetSoPassCred(sockets->second_fd());
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data1[sizeof(sent_data1) + sizeof(sent_data2)];
+ struct ucred received_creds1;
+
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds1,
+ received_data1, sizeof(sent_data1)));
+
+ EXPECT_EQ(0, memcmp(sent_data1, received_data1, sizeof(sent_data1)));
+
+ struct ucred want_creds1;
+ ASSERT_THAT(want_creds1.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds1.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds1.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds1.pid, received_creds1.pid);
+ EXPECT_EQ(want_creds1.uid, received_creds1.uid);
+ EXPECT_EQ(want_creds1.gid, received_creds1.gid);
+
+ char received_data2[sizeof(sent_data1) + sizeof(sent_data2)];
+ struct ucred received_creds2;
+
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds2,
+ received_data2, sizeof(sent_data2)));
+
+ EXPECT_EQ(0, memcmp(sent_data2, received_data2, sizeof(sent_data2)));
+
+ struct ucred want_creds2 {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds2.pid, received_creds2.pid);
+ EXPECT_EQ(want_creds2.uid, received_creds2.uid);
+ EXPECT_EQ(want_creds2.gid, received_creds2.gid);
+}
+
+TEST_P(UnixStreamSocketPairTest, CoalescedDifferingCreds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ SetSoPassCred(sockets->second_fd());
+
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data1, sizeof(sent_data1)),
+ SyscallSucceedsWithValue(sizeof(sent_data1)));
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data2, sizeof(sent_data2)),
+ SyscallSucceedsWithValue(sizeof(sent_data2)));
+
+ UnsetSoPassCred(sockets->second_fd());
+
+ char sent_data3[20];
+ RandomizeBuffer(sent_data3, sizeof(sent_data3));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data3, sizeof(sent_data3)),
+ SyscallSucceedsWithValue(sizeof(sent_data3)));
+
+ char received_data[sizeof(sent_data1) + sizeof(sent_data2) +
+ sizeof(sent_data3)];
+
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data1, received_data, sizeof(sent_data1)));
+ EXPECT_EQ(0, memcmp(sent_data2, received_data + sizeof(sent_data1),
+ sizeof(sent_data2)));
+ EXPECT_EQ(0, memcmp(sent_data3,
+ received_data + sizeof(sent_data1) + sizeof(sent_data2),
+ sizeof(sent_data3)));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, UnixStreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(UnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{
+ 0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(FilesystemBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{
+ 0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ AbstractBoundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{0, SOCK_NONBLOCK}))))));
+
+// Test fixture for tests that apply to pairs of unbound unix stream sockets.
+using UnboundUnixStreamSocketPairTest = SocketPairTest;
+
+TEST_P(UnboundUnixStreamSocketPairTest, SendtoWithoutConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ char data = 'a';
+ ASSERT_THAT(sendto(sockets->second_fd(), &data, sizeof(data), 0,
+ sockets->first_addr(), sockets->first_addr_size()),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+TEST_P(UnboundUnixStreamSocketPairTest, SendtoWithoutConnectIgnoresAddr) {
+ // FIXME(b/68223466): gVisor tries to find /foo/bar and thus returns ENOENT.
+ if (IsRunningOnGvisor()) {
+ return;
+ }
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ // Even a bogus address is completely ignored.
+ constexpr char kPath[] = "/foo/bar";
+
+ // Sanity check that kPath doesn't exist.
+ struct stat s;
+ ASSERT_THAT(stat(kPath, &s), SyscallFailsWithErrno(ENOENT));
+
+ struct sockaddr_un addr = {};
+ addr.sun_family = AF_UNIX;
+ memcpy(addr.sun_path, kPath, sizeof(kPath));
+
+ char data = 'a';
+ ASSERT_THAT(
+ sendto(sockets->second_fd(), &data, sizeof(data), 0,
+ reinterpret_cast<const struct sockaddr*>(&addr), sizeof(addr)),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, UnboundUnixStreamSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(VecCat<SocketPairKind>(
+ ApplyVec<SocketPairKind>(FilesystemUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{
+ 0, SOCK_NONBLOCK})),
+ ApplyVec<SocketPairKind>(
+ AbstractUnboundUnixDomainSocketPair,
+ AllBitwiseCombinations(List<int>{SOCK_STREAM},
+ List<int>{0, SOCK_NONBLOCK}))))));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc
new file mode 100644
index 000000000..08fc4b1b7
--- /dev/null
+++ b/test/syscalls/linux/splice.cc
@@ -0,0 +1,699 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <linux/unistd.h>
+#include <sys/eventfd.h>
+#include <sys/resource.h>
+#include <sys/sendfile.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(SpliceTest, TwoRegularFiles) {
+ // Create temp files.
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Open the input file as read only.
+ const FileDescriptor in_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Open the output file as write only.
+ const FileDescriptor out_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
+
+ // Verify that it is rejected as expected; regardless of offsets.
+ loff_t in_offset = 0;
+ loff_t out_offset = 0;
+ EXPECT_THAT(splice(in_fd.get(), &in_offset, out_fd.get(), &out_offset, 1, 0),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(splice(in_fd.get(), nullptr, out_fd.get(), &out_offset, 1, 0),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(splice(in_fd.get(), &in_offset, out_fd.get(), nullptr, 1, 0),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(splice(in_fd.get(), nullptr, out_fd.get(), nullptr, 1, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+int memfd_create(const std::string& name, unsigned int flags) {
+ return syscall(__NR_memfd_create, name.c_str(), flags);
+}
+
+TEST(SpliceTest, NegativeOffset) {
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Fill the pipe.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Open the output file as write only.
+ int fd;
+ EXPECT_THAT(fd = memfd_create("negative", 0), SyscallSucceeds());
+ const FileDescriptor out_fd(fd);
+
+ loff_t out_offset = 0xffffffffffffffffull;
+ constexpr int kSize = 2;
+ EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kSize, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Write offset + size overflows int64.
+//
+// This is a regression test for b/148041624.
+TEST(SpliceTest, WriteOverflow) {
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Fill the pipe.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Open the output file.
+ int fd;
+ EXPECT_THAT(fd = memfd_create("overflow", 0), SyscallSucceeds());
+ const FileDescriptor out_fd(fd);
+
+ // out_offset + kSize overflows INT64_MAX.
+ loff_t out_offset = 0x7ffffffffffffffeull;
+ constexpr int kSize = 3;
+ EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kSize, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SpliceTest, SamePipe) {
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Fill the pipe.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Attempt to splice to itself.
+ EXPECT_THAT(splice(rfd.get(), nullptr, wfd.get(), nullptr, kPageSize, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(TeeTest, SamePipe) {
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Fill the pipe.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Attempt to tee to itself.
+ EXPECT_THAT(tee(rfd.get(), wfd.get(), kPageSize, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(TeeTest, RegularFile) {
+ // Open some file.
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor in_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Attempt to tee from the file.
+ EXPECT_THAT(tee(in_fd.get(), wfd.get(), kPageSize, 0),
+ SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(tee(rfd.get(), in_fd.get(), kPageSize, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SpliceTest, PipeOffsets) {
+ // Create two new pipes.
+ int first[2], second[2];
+ ASSERT_THAT(pipe(first), SyscallSucceeds());
+ const FileDescriptor rfd1(first[0]);
+ const FileDescriptor wfd1(first[1]);
+ ASSERT_THAT(pipe(second), SyscallSucceeds());
+ const FileDescriptor rfd2(second[0]);
+ const FileDescriptor wfd2(second[1]);
+
+ // All pipe offsets should be rejected.
+ loff_t in_offset = 0;
+ loff_t out_offset = 0;
+ EXPECT_THAT(splice(rfd1.get(), &in_offset, wfd2.get(), &out_offset, 1, 0),
+ SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(splice(rfd1.get(), nullptr, wfd2.get(), &out_offset, 1, 0),
+ SyscallFailsWithErrno(ESPIPE));
+ EXPECT_THAT(splice(rfd1.get(), &in_offset, wfd2.get(), nullptr, 1, 0),
+ SyscallFailsWithErrno(ESPIPE));
+}
+
+// Event FDs may be used with splice without an offset.
+TEST(SpliceTest, FromEventFD) {
+ // Open the input eventfd with an initial value so that it is readable.
+ constexpr uint64_t kEventFDValue = 1;
+ int efd;
+ ASSERT_THAT(efd = eventfd(kEventFDValue, 0), SyscallSucceeds());
+ const FileDescriptor in_fd(efd);
+
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Splice 8-byte eventfd value to pipe.
+ constexpr int kEventFDSize = 8;
+ EXPECT_THAT(splice(in_fd.get(), nullptr, wfd.get(), nullptr, kEventFDSize, 0),
+ SyscallSucceedsWithValue(kEventFDSize));
+
+ // Contents should be equal.
+ std::vector<char> rbuf(kEventFDSize);
+ ASSERT_THAT(read(rfd.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(kEventFDSize));
+ EXPECT_EQ(memcmp(rbuf.data(), &kEventFDValue, rbuf.size()), 0);
+}
+
+// Event FDs may not be used with splice with an offset.
+TEST(SpliceTest, FromEventFDOffset) {
+ int efd;
+ ASSERT_THAT(efd = eventfd(0, 0), SyscallSucceeds());
+ const FileDescriptor in_fd(efd);
+
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Attempt to splice 8-byte eventfd value to pipe with offset.
+ //
+ // This is not allowed because eventfd doesn't support pread.
+ constexpr int kEventFDSize = 8;
+ loff_t in_off = 0;
+ EXPECT_THAT(splice(in_fd.get(), &in_off, wfd.get(), nullptr, kEventFDSize, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+// Event FDs may not be used with splice with an offset.
+TEST(SpliceTest, ToEventFDOffset) {
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Fill with a value.
+ constexpr int kEventFDSize = 8;
+ std::vector<char> buf(kEventFDSize);
+ buf[0] = 1;
+ ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kEventFDSize));
+
+ int efd;
+ ASSERT_THAT(efd = eventfd(0, 0), SyscallSucceeds());
+ const FileDescriptor out_fd(efd);
+
+ // Attempt to splice 8-byte eventfd value to pipe with offset.
+ //
+ // This is not allowed because eventfd doesn't support pwrite.
+ loff_t out_off = 0;
+ EXPECT_THAT(
+ splice(rfd.get(), nullptr, out_fd.get(), &out_off, kEventFDSize, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SpliceTest, ToPipe) {
+ // Open the input file.
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor in_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+
+ // Fill with some random data.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(in_fd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ ASSERT_THAT(lseek(in_fd.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Splice to the pipe.
+ EXPECT_THAT(splice(in_fd.get(), nullptr, wfd.get(), nullptr, kPageSize, 0),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Contents should be equal.
+ std::vector<char> rbuf(kPageSize);
+ ASSERT_THAT(read(rfd.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
+}
+
+TEST(SpliceTest, ToPipeOffset) {
+ // Open the input file.
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor in_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDWR));
+
+ // Fill with some random data.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(in_fd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Splice to the pipe.
+ loff_t in_offset = kPageSize / 2;
+ EXPECT_THAT(
+ splice(in_fd.get(), &in_offset, wfd.get(), nullptr, kPageSize / 2, 0),
+ SyscallSucceedsWithValue(kPageSize / 2));
+
+ // Contents should be equal to only the second part.
+ std::vector<char> rbuf(kPageSize / 2);
+ ASSERT_THAT(read(rfd.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(kPageSize / 2));
+ EXPECT_EQ(memcmp(rbuf.data(), buf.data() + (kPageSize / 2), rbuf.size()), 0);
+}
+
+TEST(SpliceTest, FromPipe) {
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Fill with some random data.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Open the input file.
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor out_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR));
+
+ // Splice to the output file.
+ EXPECT_THAT(splice(rfd.get(), nullptr, out_fd.get(), nullptr, kPageSize, 0),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // The offset of the output should be equal to kPageSize. We assert that and
+ // reset to zero so that we can read the contents and ensure they match.
+ EXPECT_THAT(lseek(out_fd.get(), 0, SEEK_CUR),
+ SyscallSucceedsWithValue(kPageSize));
+ ASSERT_THAT(lseek(out_fd.get(), 0, SEEK_SET), SyscallSucceedsWithValue(0));
+
+ // Contents should be equal.
+ std::vector<char> rbuf(kPageSize);
+ ASSERT_THAT(read(out_fd.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
+}
+
+TEST(SpliceTest, FromPipeOffset) {
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Fill with some random data.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Open the input file.
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor out_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR));
+
+ // Splice to the output file.
+ loff_t out_offset = kPageSize / 2;
+ EXPECT_THAT(
+ splice(rfd.get(), nullptr, out_fd.get(), &out_offset, kPageSize, 0),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Content should reflect the splice. We write to a specific offset in the
+ // file, so the internals should now be allocated sparsely.
+ std::vector<char> rbuf(kPageSize);
+ ASSERT_THAT(read(out_fd.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ std::vector<char> zbuf(kPageSize / 2);
+ memset(zbuf.data(), 0, zbuf.size());
+ EXPECT_EQ(memcmp(rbuf.data(), zbuf.data(), zbuf.size()), 0);
+ EXPECT_EQ(memcmp(rbuf.data() + kPageSize / 2, buf.data(), kPageSize / 2), 0);
+}
+
+TEST(SpliceTest, TwoPipes) {
+ // Create two new pipes.
+ int first[2], second[2];
+ ASSERT_THAT(pipe(first), SyscallSucceeds());
+ const FileDescriptor rfd1(first[0]);
+ const FileDescriptor wfd1(first[1]);
+ ASSERT_THAT(pipe(second), SyscallSucceeds());
+ const FileDescriptor rfd2(second[0]);
+ const FileDescriptor wfd2(second[1]);
+
+ // Fill with some random data.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(wfd1.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Splice to the second pipe, using two operations.
+ EXPECT_THAT(
+ splice(rfd1.get(), nullptr, wfd2.get(), nullptr, kPageSize / 2, 0),
+ SyscallSucceedsWithValue(kPageSize / 2));
+ EXPECT_THAT(
+ splice(rfd1.get(), nullptr, wfd2.get(), nullptr, kPageSize / 2, 0),
+ SyscallSucceedsWithValue(kPageSize / 2));
+
+ // Content should reflect the splice.
+ std::vector<char> rbuf(kPageSize);
+ ASSERT_THAT(read(rfd2.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0);
+}
+
+TEST(SpliceTest, TwoPipesCircular) {
+ // This test deadlocks the sentry on VFS1 because VFS1 splice ordering is
+ // based on fs.File.UniqueID, which does not prevent circular ordering between
+ // e.g. inode-level locks taken by fs.FileOperations.
+ SKIP_IF(IsRunningWithVFS1());
+
+ // Create two pipes.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor first_rfd(fds[0]);
+ const FileDescriptor first_wfd(fds[1]);
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor second_rfd(fds[0]);
+ const FileDescriptor second_wfd(fds[1]);
+
+ // On Linux, each pipe is normally limited to
+ // include/linux/pipe_fs_i.h:PIPE_DEF_BUFFERS buffers worth of data.
+ constexpr size_t PIPE_DEF_BUFFERS = 16;
+
+ // Write some data to each pipe. Below we splice 1 byte at a time between
+ // pipes, which very quickly causes each byte to be stored in a separate
+ // buffer, so we must ensure that the total amount of data in the system is <=
+ // PIPE_DEF_BUFFERS bytes.
+ std::vector<char> buf(PIPE_DEF_BUFFERS / 2);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(first_wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+ ASSERT_THAT(write(second_wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ // Have another thread splice from the second pipe to the first, while we
+ // splice from the first to the second. The test passes if this does not
+ // deadlock.
+ const int kIterations = 1000;
+ DisableSave ds;
+ ScopedThread t([&]() {
+ for (int i = 0; i < kIterations; i++) {
+ ASSERT_THAT(
+ splice(second_rfd.get(), nullptr, first_wfd.get(), nullptr, 1, 0),
+ SyscallSucceedsWithValue(1));
+ }
+ });
+ for (int i = 0; i < kIterations; i++) {
+ ASSERT_THAT(
+ splice(first_rfd.get(), nullptr, second_wfd.get(), nullptr, 1, 0),
+ SyscallSucceedsWithValue(1));
+ }
+}
+
+TEST(SpliceTest, Blocking) {
+ // Create two new pipes.
+ int first[2], second[2];
+ ASSERT_THAT(pipe(first), SyscallSucceeds());
+ const FileDescriptor rfd1(first[0]);
+ const FileDescriptor wfd1(first[1]);
+ ASSERT_THAT(pipe(second), SyscallSucceeds());
+ const FileDescriptor rfd2(second[0]);
+ const FileDescriptor wfd2(second[1]);
+
+ // This thread writes to the main pipe.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ScopedThread t([&]() {
+ ASSERT_THAT(write(wfd1.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ });
+
+ // Attempt a splice immediately; it should block.
+ EXPECT_THAT(splice(rfd1.get(), nullptr, wfd2.get(), nullptr, kPageSize, 0),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Thread should be joinable.
+ t.Join();
+
+ // Content should reflect the splice.
+ std::vector<char> rbuf(kPageSize);
+ ASSERT_THAT(read(rfd2.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0);
+}
+
+TEST(TeeTest, Blocking) {
+ // Create two new pipes.
+ int first[2], second[2];
+ ASSERT_THAT(pipe(first), SyscallSucceeds());
+ const FileDescriptor rfd1(first[0]);
+ const FileDescriptor wfd1(first[1]);
+ ASSERT_THAT(pipe(second), SyscallSucceeds());
+ const FileDescriptor rfd2(second[0]);
+ const FileDescriptor wfd2(second[1]);
+
+ // This thread writes to the main pipe.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ScopedThread t([&]() {
+ ASSERT_THAT(write(wfd1.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ });
+
+ // Attempt a tee immediately; it should block.
+ EXPECT_THAT(tee(rfd1.get(), wfd2.get(), kPageSize, 0),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Thread should be joinable.
+ t.Join();
+
+ // Content should reflect the splice, in both pipes.
+ std::vector<char> rbuf(kPageSize);
+ ASSERT_THAT(read(rfd2.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0);
+ ASSERT_THAT(read(rfd1.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0);
+}
+
+TEST(TeeTest, BlockingWrite) {
+ // Create two new pipes.
+ int first[2], second[2];
+ ASSERT_THAT(pipe(first), SyscallSucceeds());
+ const FileDescriptor rfd1(first[0]);
+ const FileDescriptor wfd1(first[1]);
+ ASSERT_THAT(pipe(second), SyscallSucceeds());
+ const FileDescriptor rfd2(second[0]);
+ const FileDescriptor wfd2(second[1]);
+
+ // Make some data available to be read.
+ std::vector<char> buf1(kPageSize);
+ RandomizeBuffer(buf1.data(), buf1.size());
+ ASSERT_THAT(write(wfd1.get(), buf1.data(), buf1.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Fill up the write pipe's buffer.
+ int pipe_size = -1;
+ ASSERT_THAT(pipe_size = fcntl(wfd2.get(), F_GETPIPE_SZ), SyscallSucceeds());
+ std::vector<char> buf2(pipe_size);
+ ASSERT_THAT(write(wfd2.get(), buf2.data(), buf2.size()),
+ SyscallSucceedsWithValue(pipe_size));
+
+ ScopedThread t([&]() {
+ absl::SleepFor(absl::Milliseconds(100));
+ ASSERT_THAT(read(rfd2.get(), buf2.data(), buf2.size()),
+ SyscallSucceedsWithValue(pipe_size));
+ });
+
+ // Attempt a tee immediately; it should block.
+ EXPECT_THAT(tee(rfd1.get(), wfd2.get(), kPageSize, 0),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Thread should be joinable.
+ t.Join();
+
+ // Content should reflect the tee.
+ std::vector<char> rbuf(kPageSize);
+ ASSERT_THAT(read(rfd2.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(memcmp(rbuf.data(), buf1.data(), kPageSize), 0);
+}
+
+TEST(SpliceTest, NonBlocking) {
+ // Create two new pipes.
+ int first[2], second[2];
+ ASSERT_THAT(pipe(first), SyscallSucceeds());
+ const FileDescriptor rfd1(first[0]);
+ const FileDescriptor wfd1(first[1]);
+ ASSERT_THAT(pipe(second), SyscallSucceeds());
+ const FileDescriptor rfd2(second[0]);
+ const FileDescriptor wfd2(second[1]);
+
+ // Splice with no data to back it.
+ EXPECT_THAT(splice(rfd1.get(), nullptr, wfd2.get(), nullptr, kPageSize,
+ SPLICE_F_NONBLOCK),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST(TeeTest, NonBlocking) {
+ // Create two new pipes.
+ int first[2], second[2];
+ ASSERT_THAT(pipe(first), SyscallSucceeds());
+ const FileDescriptor rfd1(first[0]);
+ const FileDescriptor wfd1(first[1]);
+ ASSERT_THAT(pipe(second), SyscallSucceeds());
+ const FileDescriptor rfd2(second[0]);
+ const FileDescriptor wfd2(second[1]);
+
+ // Splice with no data to back it.
+ EXPECT_THAT(tee(rfd1.get(), wfd2.get(), kPageSize, SPLICE_F_NONBLOCK),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST(TeeTest, MultiPage) {
+ // Create two new pipes.
+ int first[2], second[2];
+ ASSERT_THAT(pipe(first), SyscallSucceeds());
+ const FileDescriptor rfd1(first[0]);
+ const FileDescriptor wfd1(first[1]);
+ ASSERT_THAT(pipe(second), SyscallSucceeds());
+ const FileDescriptor rfd2(second[0]);
+ const FileDescriptor wfd2(second[1]);
+
+ // Make some data available to be read.
+ std::vector<char> wbuf(8 * kPageSize);
+ RandomizeBuffer(wbuf.data(), wbuf.size());
+ ASSERT_THAT(write(wfd1.get(), wbuf.data(), wbuf.size()),
+ SyscallSucceedsWithValue(wbuf.size()));
+
+ // Attempt a tee immediately; it should complete.
+ EXPECT_THAT(tee(rfd1.get(), wfd2.get(), wbuf.size(), 0),
+ SyscallSucceedsWithValue(wbuf.size()));
+
+ // Content should reflect the tee.
+ std::vector<char> rbuf(wbuf.size());
+ ASSERT_THAT(read(rfd2.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(rbuf.size()));
+ EXPECT_EQ(memcmp(rbuf.data(), wbuf.data(), rbuf.size()), 0);
+ ASSERT_THAT(read(rfd1.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(rbuf.size()));
+ EXPECT_EQ(memcmp(rbuf.data(), wbuf.data(), rbuf.size()), 0);
+}
+
+TEST(SpliceTest, FromPipeMaxFileSize) {
+ // Create a new pipe.
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ const FileDescriptor rfd(fds[0]);
+ const FileDescriptor wfd(fds[1]);
+
+ // Fill with some random data.
+ std::vector<char> buf(kPageSize);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(write(wfd.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+
+ // Open the input file.
+ const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor out_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDWR));
+
+ EXPECT_THAT(ftruncate(out_fd.get(), 13 << 20), SyscallSucceeds());
+ EXPECT_THAT(lseek(out_fd.get(), 0, SEEK_END),
+ SyscallSucceedsWithValue(13 << 20));
+
+ // Set our file size limit.
+ sigset_t set;
+ sigemptyset(&set);
+ sigaddset(&set, SIGXFSZ);
+ TEST_PCHECK(sigprocmask(SIG_BLOCK, &set, nullptr) == 0);
+ rlimit rlim = {};
+ rlim.rlim_cur = rlim.rlim_max = (13 << 20);
+ EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &rlim), SyscallSucceeds());
+
+ // Splice to the output file.
+ EXPECT_THAT(
+ splice(rfd.get(), nullptr, out_fd.get(), nullptr, 3 * kPageSize, 0),
+ SyscallFailsWithErrno(EFBIG));
+
+ // Contents should be equal.
+ std::vector<char> rbuf(kPageSize);
+ ASSERT_THAT(read(rfd.get(), rbuf.data(), rbuf.size()),
+ SyscallSucceedsWithValue(kPageSize));
+ EXPECT_EQ(memcmp(rbuf.data(), buf.data(), buf.size()), 0);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc
new file mode 100644
index 000000000..2503960f3
--- /dev/null
+++ b/test/syscalls/linux/stat.cc
@@ -0,0 +1,720 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+#ifndef AT_STATX_FORCE_SYNC
+#define AT_STATX_FORCE_SYNC 0x2000
+#endif
+#ifndef AT_STATX_DONT_SYNC
+#define AT_STATX_DONT_SYNC 0x4000
+#endif
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class StatTest : public FileTest {};
+
+TEST_F(StatTest, FstatatAbs) {
+ struct stat st;
+
+ // Check that the stat works.
+ EXPECT_THAT(fstatat(AT_FDCWD, test_file_name_.c_str(), &st, 0),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISREG(st.st_mode));
+}
+
+TEST_F(StatTest, FstatatEmptyPath) {
+ struct stat st;
+ const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY));
+
+ // Check that the stat works.
+ EXPECT_THAT(fstatat(fd.get(), "", &st, AT_EMPTY_PATH), SyscallSucceeds());
+ EXPECT_TRUE(S_ISREG(st.st_mode));
+}
+
+TEST_F(StatTest, FstatatRel) {
+ struct stat st;
+ int dirfd;
+ auto filename = std::string(Basename(test_file_name_));
+
+ // Open the temporary directory read-only.
+ ASSERT_THAT(dirfd = open(GetAbsoluteTestTmpdir().c_str(), O_RDONLY),
+ SyscallSucceeds());
+
+ // Check that the stat works.
+ EXPECT_THAT(fstatat(dirfd, filename.c_str(), &st, 0), SyscallSucceeds());
+ EXPECT_TRUE(S_ISREG(st.st_mode));
+ close(dirfd);
+}
+
+TEST_F(StatTest, FstatatSymlink) {
+ struct stat st;
+
+ // Check that the link is followed.
+ EXPECT_THAT(fstatat(AT_FDCWD, "/proc/self", &st, 0), SyscallSucceeds());
+ EXPECT_TRUE(S_ISDIR(st.st_mode));
+ EXPECT_FALSE(S_ISLNK(st.st_mode));
+
+ // Check that the flag works.
+ EXPECT_THAT(fstatat(AT_FDCWD, "/proc/self", &st, AT_SYMLINK_NOFOLLOW),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISLNK(st.st_mode));
+ EXPECT_FALSE(S_ISDIR(st.st_mode));
+}
+
+TEST_F(StatTest, Nlinks) {
+ TempPath basedir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ // Directory is initially empty, it should contain 2 links (one from itself,
+ // one from ".").
+ EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(2));
+
+ // Create a file in the test directory. Files shouldn't increase the link
+ // count on the base directory.
+ TempPath file1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(basedir.path()));
+ EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(2));
+
+ // Create subdirectories. This should increase the link count by 1 per
+ // subdirectory.
+ TempPath dir1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(basedir.path()));
+ EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(3));
+ TempPath dir2 =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(basedir.path()));
+ EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(4));
+
+ // Removing directories should reduce the link count.
+ dir1.reset();
+ EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(3));
+ dir2.reset();
+ EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(2));
+
+ // Removing files should have no effect on link count.
+ file1.reset();
+ EXPECT_THAT(Links(basedir.path()), IsPosixErrorOkAndHolds(2));
+}
+
+TEST_F(StatTest, BlocksIncreaseOnWrite) {
+ struct stat st;
+
+ // Stat the empty file.
+ ASSERT_THAT(fstat(test_file_fd_.get(), &st), SyscallSucceeds());
+
+ const int initial_blocks = st.st_blocks;
+
+ // Write to the file, making sure to exceed the block size.
+ std::vector<char> buf(2 * st.st_blksize, 'a');
+ ASSERT_THAT(write(test_file_fd_.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ // Stat the file again, and verify that number of allocated blocks has
+ // increased.
+ ASSERT_THAT(fstat(test_file_fd_.get(), &st), SyscallSucceeds());
+ EXPECT_GT(st.st_blocks, initial_blocks);
+}
+
+TEST_F(StatTest, PathNotCleaned) {
+ TempPath basedir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ // Create a file in the basedir.
+ TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(basedir.path()));
+
+ // Stating the file directly should succeed.
+ struct stat buf;
+ EXPECT_THAT(lstat(file.path().c_str(), &buf), SyscallSucceeds());
+
+ // Try to stat the file using a directory that does not exist followed by
+ // "..". If the path is cleaned prior to stating (which it should not be)
+ // then this will succeed.
+ const std::string bad_path = JoinPath("/does_not_exist/..", file.path());
+ EXPECT_THAT(lstat(bad_path.c_str(), &buf), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_F(StatTest, PathCanContainDotDot) {
+ TempPath basedir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath subdir =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(basedir.path()));
+ const std::string subdir_name = std::string(Basename(subdir.path()));
+
+ // Create a file in the subdir.
+ TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(subdir.path()));
+ const std::string file_name = std::string(Basename(file.path()));
+
+ // Stat the file through a path that includes '..' and '.' but still resolves
+ // to the file.
+ const std::string good_path =
+ JoinPath(basedir.path(), subdir_name, "..", subdir_name, ".", file_name);
+ struct stat buf;
+ EXPECT_THAT(lstat(good_path.c_str(), &buf), SyscallSucceeds());
+}
+
+TEST_F(StatTest, PathCanContainEmptyComponent) {
+ TempPath basedir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ // Create a file in the basedir.
+ TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(basedir.path()));
+ const std::string file_name = std::string(Basename(file.path()));
+
+ // Stat the file through a path that includes an empty component. We have to
+ // build this ourselves because JoinPath automatically removes empty
+ // components.
+ const std::string good_path = absl::StrCat(basedir.path(), "//", file_name);
+ struct stat buf;
+ EXPECT_THAT(lstat(good_path.c_str(), &buf), SyscallSucceeds());
+}
+
+TEST_F(StatTest, TrailingSlashNotCleanedReturnsENOTDIR) {
+ TempPath basedir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ // Create a file in the basedir.
+ TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(basedir.path()));
+
+ // Stat the file with an extra "/" on the end of it. Since file is not a
+ // directory, this should return ENOTDIR.
+ const std::string bad_path = absl::StrCat(file.path(), "/");
+ struct stat buf;
+ EXPECT_THAT(lstat(bad_path.c_str(), &buf), SyscallFailsWithErrno(ENOTDIR));
+}
+
+// Test fstatating a symlink directory.
+TEST_F(StatTest, FstatatSymlinkDir) {
+ // Create a directory and symlink to it.
+ const auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ const std::string symlink_to_dir = NewTempAbsPath();
+ EXPECT_THAT(symlink(dir.path().c_str(), symlink_to_dir.c_str()),
+ SyscallSucceeds());
+ auto cleanup = Cleanup([&symlink_to_dir]() {
+ EXPECT_THAT(unlink(symlink_to_dir.c_str()), SyscallSucceeds());
+ });
+
+ // Fstatat the link with AT_SYMLINK_NOFOLLOW should return symlink data.
+ struct stat st = {};
+ EXPECT_THAT(
+ fstatat(AT_FDCWD, symlink_to_dir.c_str(), &st, AT_SYMLINK_NOFOLLOW),
+ SyscallSucceeds());
+ EXPECT_FALSE(S_ISDIR(st.st_mode));
+ EXPECT_TRUE(S_ISLNK(st.st_mode));
+
+ // Fstatat the link should return dir data.
+ EXPECT_THAT(fstatat(AT_FDCWD, symlink_to_dir.c_str(), &st, 0),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISDIR(st.st_mode));
+ EXPECT_FALSE(S_ISLNK(st.st_mode));
+}
+
+// Test fstatating a symlink directory with trailing slash.
+TEST_F(StatTest, FstatatSymlinkDirWithTrailingSlash) {
+ // Create a directory and symlink to it.
+ const auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const std::string symlink_to_dir = NewTempAbsPath();
+ EXPECT_THAT(symlink(dir.path().c_str(), symlink_to_dir.c_str()),
+ SyscallSucceeds());
+ auto cleanup = Cleanup([&symlink_to_dir]() {
+ EXPECT_THAT(unlink(symlink_to_dir.c_str()), SyscallSucceeds());
+ });
+
+ // Fstatat on the symlink with a trailing slash should return the directory
+ // data.
+ struct stat st = {};
+ EXPECT_THAT(
+ fstatat(AT_FDCWD, absl::StrCat(symlink_to_dir, "/").c_str(), &st, 0),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISDIR(st.st_mode));
+ EXPECT_FALSE(S_ISLNK(st.st_mode));
+
+ // Fstatat on the symlink with a trailing slash with AT_SYMLINK_NOFOLLOW
+ // should return the directory data.
+ // Symlink to directory with trailing slash will ignore AT_SYMLINK_NOFOLLOW.
+ EXPECT_THAT(fstatat(AT_FDCWD, absl::StrCat(symlink_to_dir, "/").c_str(), &st,
+ AT_SYMLINK_NOFOLLOW),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISDIR(st.st_mode));
+ EXPECT_FALSE(S_ISLNK(st.st_mode));
+}
+
+// Test fstatating a symlink directory with a trailing slash
+// should return same stat data with fstatating directory.
+TEST_F(StatTest, FstatatSymlinkDirWithTrailingSlashSameInode) {
+ // Create a directory and symlink to it.
+ const auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ // We are going to assert that the symlink inode id is the same as the linked
+ // dir's inode id. In order for the inode id to be stable across
+ // save/restore, it must be kept open. The FileDescriptor type will do that
+ // for us automatically.
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY | O_DIRECTORY));
+
+ const std::string symlink_to_dir = NewTempAbsPath();
+ EXPECT_THAT(symlink(dir.path().c_str(), symlink_to_dir.c_str()),
+ SyscallSucceeds());
+ auto cleanup = Cleanup([&symlink_to_dir]() {
+ EXPECT_THAT(unlink(symlink_to_dir.c_str()), SyscallSucceeds());
+ });
+
+ // Fstatat on the symlink with a trailing slash should return the directory
+ // data.
+ struct stat st = {};
+ EXPECT_THAT(fstatat(AT_FDCWD, absl::StrCat(symlink_to_dir, "/").c_str(), &st,
+ AT_SYMLINK_NOFOLLOW),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISDIR(st.st_mode));
+
+ // Dir and symlink should point to same inode.
+ struct stat st_dir = {};
+ EXPECT_THAT(
+ fstatat(AT_FDCWD, dir.path().c_str(), &st_dir, AT_SYMLINK_NOFOLLOW),
+ SyscallSucceeds());
+ EXPECT_EQ(st.st_ino, st_dir.st_ino);
+}
+
+TEST_F(StatTest, LeadingDoubleSlash) {
+ // Create a file, and make sure we can stat it.
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ struct stat st;
+ ASSERT_THAT(lstat(file.path().c_str(), &st), SyscallSucceeds());
+
+ // Now add an extra leading slash.
+ const std::string double_slash_path = absl::StrCat("/", file.path());
+ ASSERT_TRUE(absl::StartsWith(double_slash_path, "//"));
+
+ // We should be able to stat the new path, and it should resolve to the same
+ // file (same device and inode).
+ struct stat double_slash_st;
+ ASSERT_THAT(lstat(double_slash_path.c_str(), &double_slash_st),
+ SyscallSucceeds());
+ EXPECT_EQ(st.st_dev, double_slash_st.st_dev);
+ EXPECT_EQ(st.st_ino, double_slash_st.st_ino);
+}
+
+// Test that a rename doesn't change the underlying file.
+TEST_F(StatTest, StatDoesntChangeAfterRename) {
+ const TempPath old_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath new_path(NewTempAbsPath());
+
+ struct stat st_old = {};
+ struct stat st_new = {};
+
+ ASSERT_THAT(stat(old_dir.path().c_str(), &st_old), SyscallSucceeds());
+ ASSERT_THAT(rename(old_dir.path().c_str(), new_path.path().c_str()),
+ SyscallSucceeds());
+ ASSERT_THAT(stat(new_path.path().c_str(), &st_new), SyscallSucceeds());
+
+ EXPECT_EQ(st_old.st_nlink, st_new.st_nlink);
+ EXPECT_EQ(st_old.st_dev, st_new.st_dev);
+ EXPECT_EQ(st_old.st_ino, st_new.st_ino);
+ EXPECT_EQ(st_old.st_mode, st_new.st_mode);
+ EXPECT_EQ(st_old.st_uid, st_new.st_uid);
+ EXPECT_EQ(st_old.st_gid, st_new.st_gid);
+ EXPECT_EQ(st_old.st_size, st_new.st_size);
+}
+
+// Test link counts with a regular file as the child.
+TEST_F(StatTest, LinkCountsWithRegularFileChild) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ struct stat st_parent_before = {};
+ ASSERT_THAT(stat(dir.path().c_str(), &st_parent_before), SyscallSucceeds());
+ EXPECT_EQ(st_parent_before.st_nlink, 2);
+
+ // Adding a regular file doesn't adjust the parent's link count.
+ const TempPath child =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+
+ struct stat st_parent_after = {};
+ ASSERT_THAT(stat(dir.path().c_str(), &st_parent_after), SyscallSucceeds());
+ EXPECT_EQ(st_parent_after.st_nlink, 2);
+
+ // The child should have a single link from the parent.
+ struct stat st_child = {};
+ ASSERT_THAT(stat(child.path().c_str(), &st_child), SyscallSucceeds());
+ EXPECT_TRUE(S_ISREG(st_child.st_mode));
+ EXPECT_EQ(st_child.st_nlink, 1);
+
+ // Finally unlinking the child should not affect the parent's link count.
+ ASSERT_THAT(unlink(child.path().c_str()), SyscallSucceeds());
+ ASSERT_THAT(stat(dir.path().c_str(), &st_parent_after), SyscallSucceeds());
+ EXPECT_EQ(st_parent_after.st_nlink, 2);
+}
+
+// This test verifies that inodes remain around when there is an open fd
+// after link count hits 0.
+TEST_F(StatTest, ZeroLinksOpenFdRegularFileChild_NoRandomSave) {
+ // Setting the enviornment variable GVISOR_GOFER_UNCACHED to any value
+ // will prevent this test from running, see the tmpfs lifecycle.
+ //
+ // We need to support this because when a file is unlinked and we forward
+ // the stat to the gofer it would return ENOENT.
+ const char* uncached_gofer = getenv("GVISOR_GOFER_UNCACHED");
+ SKIP_IF(uncached_gofer != nullptr);
+
+ // We don't support saving unlinked files.
+ const DisableSave ds;
+
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath child = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ dir.path(), "hello", TempPath::kDefaultFileMode));
+
+ // The child should have a single link from the parent.
+ struct stat st_child_before = {};
+ ASSERT_THAT(stat(child.path().c_str(), &st_child_before), SyscallSucceeds());
+ EXPECT_TRUE(S_ISREG(st_child_before.st_mode));
+ EXPECT_EQ(st_child_before.st_nlink, 1);
+ EXPECT_EQ(st_child_before.st_size, 5); // Hello is 5 bytes.
+
+ // Open the file so we can fstat after unlinking.
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(child.path(), O_RDONLY));
+
+ // Now a stat should return ENOENT but we should still be able to stat
+ // via the open fd and fstat.
+ ASSERT_THAT(unlink(child.path().c_str()), SyscallSucceeds());
+
+ // Since the file has no more links stat should fail.
+ struct stat st_child_after = {};
+ ASSERT_THAT(stat(child.path().c_str(), &st_child_after),
+ SyscallFailsWithErrno(ENOENT));
+
+ // Fstat should still allow us to access the same file via the fd.
+ struct stat st_child_fd = {};
+ ASSERT_THAT(fstat(fd.get(), &st_child_fd), SyscallSucceeds());
+ EXPECT_EQ(st_child_before.st_dev, st_child_fd.st_dev);
+ EXPECT_EQ(st_child_before.st_ino, st_child_fd.st_ino);
+ EXPECT_EQ(st_child_before.st_mode, st_child_fd.st_mode);
+ EXPECT_EQ(st_child_before.st_uid, st_child_fd.st_uid);
+ EXPECT_EQ(st_child_before.st_gid, st_child_fd.st_gid);
+ EXPECT_EQ(st_child_before.st_size, st_child_fd.st_size);
+
+ // TODO(b/34861058): This isn't ideal but since fstatfs(2) will always return
+ // OVERLAYFS_SUPER_MAGIC we have no way to know if this fs is backed by a
+ // gofer which doesn't support links.
+ EXPECT_TRUE(st_child_fd.st_nlink == 0 || st_child_fd.st_nlink == 1);
+}
+
+// Test link counts with a directory as the child.
+TEST_F(StatTest, LinkCountsWithDirChild) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ // Before a child is added the two links are "." and the link from the parent.
+ struct stat st_parent_before = {};
+ ASSERT_THAT(stat(dir.path().c_str(), &st_parent_before), SyscallSucceeds());
+ EXPECT_EQ(st_parent_before.st_nlink, 2);
+
+ // Create a subdirectory and stat for the parent link counts.
+ const TempPath sub_dir =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path()));
+
+ // The three links are ".", the link from the parent, and the link from
+ // the child as "..".
+ struct stat st_parent_after = {};
+ ASSERT_THAT(stat(dir.path().c_str(), &st_parent_after), SyscallSucceeds());
+ EXPECT_EQ(st_parent_after.st_nlink, 3);
+
+ // The child will have 1 link from the parent and 1 link which represents ".".
+ struct stat st_child = {};
+ ASSERT_THAT(stat(sub_dir.path().c_str(), &st_child), SyscallSucceeds());
+ EXPECT_TRUE(S_ISDIR(st_child.st_mode));
+ EXPECT_EQ(st_child.st_nlink, 2);
+
+ // Finally delete the child dir and the parent link count should return to 2.
+ ASSERT_THAT(rmdir(sub_dir.path().c_str()), SyscallSucceeds());
+ ASSERT_THAT(stat(dir.path().c_str(), &st_parent_after), SyscallSucceeds());
+
+ // Now we should only have links from the parent and "." since the subdir
+ // has been removed.
+ EXPECT_EQ(st_parent_after.st_nlink, 2);
+}
+
+// Test statting a child of a non-directory.
+TEST_F(StatTest, ChildOfNonDir) {
+ // Create a path that has a child of a regular file.
+ const std::string filename = JoinPath(test_file_name_, "child");
+
+ // Statting the path should return ENOTDIR.
+ struct stat st;
+ EXPECT_THAT(lstat(filename.c_str(), &st), SyscallFailsWithErrno(ENOTDIR));
+}
+
+// Test lstating a symlink directory.
+TEST_F(StatTest, LstatSymlinkDir) {
+ // Create a directory and symlink to it.
+ const auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const std::string symlink_to_dir = NewTempAbsPath();
+ EXPECT_THAT(symlink(dir.path().c_str(), symlink_to_dir.c_str()),
+ SyscallSucceeds());
+ auto cleanup = Cleanup([&symlink_to_dir]() {
+ EXPECT_THAT(unlink(symlink_to_dir.c_str()), SyscallSucceeds());
+ });
+
+ // Lstat on the symlink should return symlink data.
+ struct stat st = {};
+ ASSERT_THAT(lstat(symlink_to_dir.c_str(), &st), SyscallSucceeds());
+ EXPECT_FALSE(S_ISDIR(st.st_mode));
+ EXPECT_TRUE(S_ISLNK(st.st_mode));
+
+ // Lstat on the symlink with a trailing slash should return the directory
+ // data.
+ ASSERT_THAT(lstat(absl::StrCat(symlink_to_dir, "/").c_str(), &st),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISDIR(st.st_mode));
+ EXPECT_FALSE(S_ISLNK(st.st_mode));
+}
+
+// Verify that we get an ELOOP from too many symbolic links even when there
+// are directories in the middle.
+TEST_F(StatTest, LstatELOOPPath) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ std::string subdir_base = "subdir";
+ ASSERT_THAT(mkdir(JoinPath(dir.path(), subdir_base).c_str(), 0755),
+ SyscallSucceeds());
+
+ std::string target = JoinPath(dir.path(), subdir_base, subdir_base);
+ std::string dst = JoinPath("..", subdir_base);
+ ASSERT_THAT(symlink(dst.c_str(), target.c_str()), SyscallSucceeds());
+ auto cleanup = Cleanup(
+ [&target]() { EXPECT_THAT(unlink(target.c_str()), SyscallSucceeds()); });
+
+ // Now build a path which is /subdir/subdir/... repeated many times so that
+ // we can build a path that is shorter than PATH_MAX but can still cause
+ // too many symbolic links. Note: Every other subdir is actually a directory
+ // so we're not in a situation where it's a -> b -> a -> b, where a and b
+ // are symbolic links.
+ std::string path = dir.path();
+ std::string subdir_append = absl::StrCat("/", subdir_base);
+ do {
+ absl::StrAppend(&path, subdir_append);
+ // Keep appending /subdir until we would overflow PATH_MAX.
+ } while ((path.size() + subdir_append.size()) < PATH_MAX);
+
+ struct stat s = {};
+ ASSERT_THAT(lstat(path.c_str(), &s), SyscallFailsWithErrno(ELOOP));
+}
+
+// Ensure that inode allocation for anonymous devices work correctly across
+// save/restore. In particular, inode numbers should be unique across S/R.
+TEST(SimpleStatTest, AnonDeviceAllocatesUniqueInodesAcrossSaveRestore) {
+ // Use sockets as a convenient way to create inodes on an anonymous device.
+ int fd;
+ ASSERT_THAT(fd = socket(AF_UNIX, SOCK_STREAM, 0), SyscallSucceeds());
+ FileDescriptor fd1(fd);
+ MaybeSave();
+ ASSERT_THAT(fd = socket(AF_UNIX, SOCK_STREAM, 0), SyscallSucceeds());
+ FileDescriptor fd2(fd);
+
+ struct stat st1;
+ struct stat st2;
+ ASSERT_THAT(fstat(fd1.get(), &st1), SyscallSucceeds());
+ ASSERT_THAT(fstat(fd2.get(), &st2), SyscallSucceeds());
+
+ // The two fds should have different inode numbers.
+ EXPECT_NE(st2.st_ino, st1.st_ino);
+
+ // Verify again after another S/R cycle. The inode numbers should remain the
+ // same.
+ MaybeSave();
+
+ struct stat st1_after;
+ struct stat st2_after;
+ ASSERT_THAT(fstat(fd1.get(), &st1_after), SyscallSucceeds());
+ ASSERT_THAT(fstat(fd2.get(), &st2_after), SyscallSucceeds());
+
+ EXPECT_EQ(st1_after.st_ino, st1.st_ino);
+ EXPECT_EQ(st2_after.st_ino, st2.st_ino);
+}
+
+#ifndef SYS_statx
+#if defined(__x86_64__)
+#define SYS_statx 332
+#elif defined(__aarch64__)
+#define SYS_statx 291
+#else
+#error "Unknown architecture"
+#endif
+#endif // SYS_statx
+
+#ifndef STATX_ALL
+#define STATX_ALL 0x00000fffU
+#endif // STATX_ALL
+
+// struct kernel_statx_timestamp is a Linux statx_timestamp struct.
+struct kernel_statx_timestamp {
+ int64_t tv_sec;
+ uint32_t tv_nsec;
+ int32_t __reserved;
+};
+
+// struct kernel_statx is a Linux statx struct. Old versions of glibc do not
+// expose it. See include/uapi/linux/stat.h
+struct kernel_statx {
+ uint32_t stx_mask;
+ uint32_t stx_blksize;
+ uint64_t stx_attributes;
+ uint32_t stx_nlink;
+ uint32_t stx_uid;
+ uint32_t stx_gid;
+ uint16_t stx_mode;
+ uint16_t __spare0[1];
+ uint64_t stx_ino;
+ uint64_t stx_size;
+ uint64_t stx_blocks;
+ uint64_t stx_attributes_mask;
+ struct kernel_statx_timestamp stx_atime;
+ struct kernel_statx_timestamp stx_btime;
+ struct kernel_statx_timestamp stx_ctime;
+ struct kernel_statx_timestamp stx_mtime;
+ uint32_t stx_rdev_major;
+ uint32_t stx_rdev_minor;
+ uint32_t stx_dev_major;
+ uint32_t stx_dev_minor;
+ uint64_t __spare2[14];
+};
+
+int statx(int dirfd, const char* pathname, int flags, unsigned int mask,
+ struct kernel_statx* statxbuf) {
+ return syscall(SYS_statx, dirfd, pathname, flags, mask, statxbuf);
+}
+
+TEST_F(StatTest, StatxAbsPath) {
+ SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+ errno == ENOSYS);
+
+ struct kernel_statx stx;
+ EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, STATX_ALL, &stx),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISREG(stx.stx_mode));
+}
+
+TEST_F(StatTest, StatxRelPathDirFD) {
+ SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+ errno == ENOSYS);
+
+ struct kernel_statx stx;
+ auto const dirfd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(GetAbsoluteTestTmpdir(), O_RDONLY));
+ auto filename = std::string(Basename(test_file_name_));
+
+ EXPECT_THAT(statx(dirfd.get(), filename.c_str(), 0, STATX_ALL, &stx),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISREG(stx.stx_mode));
+}
+
+TEST_F(StatTest, StatxRelPathCwd) {
+ SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+ errno == ENOSYS);
+
+ ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds());
+ auto filename = std::string(Basename(test_file_name_));
+ struct kernel_statx stx;
+ EXPECT_THAT(statx(AT_FDCWD, filename.c_str(), 0, STATX_ALL, &stx),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISREG(stx.stx_mode));
+}
+
+TEST_F(StatTest, StatxEmptyPath) {
+ SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+ errno == ENOSYS);
+
+ const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY));
+ struct kernel_statx stx;
+ EXPECT_THAT(statx(fd.get(), "", AT_EMPTY_PATH, STATX_ALL, &stx),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISREG(stx.stx_mode));
+}
+
+TEST_F(StatTest, StatxDoesNotRejectExtraneousMaskBits) {
+ SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+ errno == ENOSYS);
+
+ struct kernel_statx stx;
+ // Set all mask bits except for STATX__RESERVED.
+ uint mask = 0xffffffff & ~0x80000000;
+ EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, mask, &stx),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISREG(stx.stx_mode));
+}
+
+TEST_F(StatTest, StatxRejectsReservedMaskBit) {
+ SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+ errno == ENOSYS);
+
+ struct kernel_statx stx;
+ // Set STATX__RESERVED in the mask.
+ EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, 0x80000000, &stx),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(StatTest, StatxSymlink) {
+ SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+ errno == ENOSYS);
+
+ std::string parent_dir = "/tmp";
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(parent_dir, test_file_name_));
+ std::string p = link.path();
+
+ struct kernel_statx stx;
+ EXPECT_THAT(statx(AT_FDCWD, p.c_str(), AT_SYMLINK_NOFOLLOW, STATX_ALL, &stx),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISLNK(stx.stx_mode));
+ EXPECT_THAT(statx(AT_FDCWD, p.c_str(), 0, STATX_ALL, &stx),
+ SyscallSucceeds());
+ EXPECT_TRUE(S_ISREG(stx.stx_mode));
+}
+
+TEST_F(StatTest, StatxInvalidFlags) {
+ SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 &&
+ errno == ENOSYS);
+
+ struct kernel_statx stx;
+ EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), 12345, 0, &stx),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Sync flags are mutually exclusive.
+ EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(),
+ AT_STATX_FORCE_SYNC | AT_STATX_DONT_SYNC, 0, &stx),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/stat_times.cc b/test/syscalls/linux/stat_times.cc
new file mode 100644
index 000000000..68c0bef09
--- /dev/null
+++ b/test/syscalls/linux/stat_times.cc
@@ -0,0 +1,303 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <sys/stat.h>
+
+#include <tuple>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+using ::testing::IsEmpty;
+using ::testing::Not;
+
+std::tuple<absl::Time, absl::Time, absl::Time> GetTime(const TempPath& file) {
+ struct stat statbuf = {};
+ EXPECT_THAT(stat(file.path().c_str(), &statbuf), SyscallSucceeds());
+
+ const auto atime = absl::TimeFromTimespec(statbuf.st_atim);
+ const auto mtime = absl::TimeFromTimespec(statbuf.st_mtim);
+ const auto ctime = absl::TimeFromTimespec(statbuf.st_ctim);
+ return std::make_tuple(atime, mtime, ctime);
+}
+
+enum class AtimeEffect {
+ Unchanged,
+ Changed,
+};
+
+enum class MtimeEffect {
+ Unchanged,
+ Changed,
+};
+
+enum class CtimeEffect {
+ Unchanged,
+ Changed,
+};
+
+// Tests that fn modifies the atime/mtime/ctime of path as specified.
+void CheckTimes(const TempPath& path, std::function<void()> fn,
+ AtimeEffect atime_effect, MtimeEffect mtime_effect,
+ CtimeEffect ctime_effect) {
+ absl::Time atime, mtime, ctime;
+ std::tie(atime, mtime, ctime) = GetTime(path);
+
+ // FIXME(b/132819225): gVisor filesystem timestamps inconsistently use the
+ // internal or host clock, which may diverge slightly. Allow some slack on
+ // times to account for the difference.
+ //
+ // Here we sleep for 1s so that initial creation of path doesn't fall within
+ // the before slack window.
+ absl::SleepFor(absl::Seconds(1));
+
+ const absl::Time before = absl::Now() - absl::Seconds(1);
+
+ // Perform the op.
+ fn();
+
+ const absl::Time after = absl::Now() + absl::Seconds(1);
+
+ absl::Time atime2, mtime2, ctime2;
+ std::tie(atime2, mtime2, ctime2) = GetTime(path);
+
+ if (atime_effect == AtimeEffect::Changed) {
+ EXPECT_LE(before, atime2);
+ EXPECT_GE(after, atime2);
+ EXPECT_GT(atime2, atime);
+ } else {
+ EXPECT_EQ(atime2, atime);
+ }
+
+ if (mtime_effect == MtimeEffect::Changed) {
+ EXPECT_LE(before, mtime2);
+ EXPECT_GE(after, mtime2);
+ EXPECT_GT(mtime2, mtime);
+ } else {
+ EXPECT_EQ(mtime2, mtime);
+ }
+
+ if (ctime_effect == CtimeEffect::Changed) {
+ EXPECT_LE(before, ctime2);
+ EXPECT_GE(after, ctime2);
+ EXPECT_GT(ctime2, ctime);
+ } else {
+ EXPECT_EQ(ctime2, ctime);
+ }
+}
+
+// File creation time is reflected in atime, mtime, and ctime.
+TEST(StatTimesTest, FileCreation) {
+ const DisableSave ds; // Timing-related test.
+
+ // Get a time for when the file is created.
+ //
+ // FIXME(b/132819225): See above.
+ const absl::Time before = absl::Now() - absl::Seconds(1);
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const absl::Time after = absl::Now() + absl::Seconds(1);
+
+ absl::Time atime, mtime, ctime;
+ std::tie(atime, mtime, ctime) = GetTime(file);
+
+ EXPECT_LE(before, atime);
+ EXPECT_LE(before, mtime);
+ EXPECT_LE(before, ctime);
+ EXPECT_GE(after, atime);
+ EXPECT_GE(after, mtime);
+ EXPECT_GE(after, ctime);
+}
+
+// Calling chmod on a file changes ctime.
+TEST(StatTimesTest, FileChmod) {
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ auto fn = [&] {
+ EXPECT_THAT(chmod(file.path().c_str(), 0666), SyscallSucceeds());
+ };
+ CheckTimes(file, fn, AtimeEffect::Unchanged, MtimeEffect::Unchanged,
+ CtimeEffect::Changed);
+}
+
+// Renaming a file changes ctime.
+TEST(StatTimesTest, FileRename) {
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ const std::string newpath = NewTempAbsPath();
+
+ auto fn = [&] {
+ ASSERT_THAT(rename(file.release().c_str(), newpath.c_str()),
+ SyscallSucceeds());
+ file.reset(newpath);
+ };
+ CheckTimes(file, fn, AtimeEffect::Unchanged, MtimeEffect::Unchanged,
+ CtimeEffect::Changed);
+}
+
+// Renaming a file changes ctime, even with an open FD.
+//
+// NOTE(b/132732387): This is a regression test for fs/gofer failing to update
+// cached ctime.
+TEST(StatTimesTest, FileRenameOpenFD) {
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // Holding an FD shouldn't affect behavior.
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY));
+
+ const std::string newpath = NewTempAbsPath();
+
+ // FIXME(b/132814682): Restore fails with an uncached gofer and an open FD
+ // across rename.
+ //
+ // N.B. The logic here looks backwards because it isn't possible to
+ // conditionally disable save, only conditionally re-enable it.
+ DisableSave ds;
+ if (!getenv("GVISOR_GOFER_UNCACHED")) {
+ ds.reset();
+ }
+
+ auto fn = [&] {
+ ASSERT_THAT(rename(file.release().c_str(), newpath.c_str()),
+ SyscallSucceeds());
+ file.reset(newpath);
+ };
+ CheckTimes(file, fn, AtimeEffect::Unchanged, MtimeEffect::Unchanged,
+ CtimeEffect::Changed);
+}
+
+// Calling utimes on a file changes ctime and the time that we ask to change
+// (atime to now in this case).
+TEST(StatTimesTest, FileUtimes) {
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ auto fn = [&] {
+ const struct timespec ts[2] = {{0, UTIME_NOW}, {0, UTIME_OMIT}};
+ ASSERT_THAT(utimensat(AT_FDCWD, file.path().c_str(), ts, 0),
+ SyscallSucceeds());
+ };
+ CheckTimes(file, fn, AtimeEffect::Changed, MtimeEffect::Unchanged,
+ CtimeEffect::Changed);
+}
+
+// Truncating a file changes mtime and ctime.
+TEST(StatTimesTest, FileTruncate) {
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "yaaass", 0666));
+
+ auto fn = [&] {
+ EXPECT_THAT(truncate(file.path().c_str(), 0), SyscallSucceeds());
+ };
+ CheckTimes(file, fn, AtimeEffect::Unchanged, MtimeEffect::Changed,
+ CtimeEffect::Changed);
+}
+
+// Writing a file changes mtime and ctime.
+TEST(StatTimesTest, FileWrite) {
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), "yaaass", 0666));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR, 0));
+
+ auto fn = [&] {
+ const std::string contents = "all the single dollars";
+ EXPECT_THAT(WriteFd(fd.get(), contents.data(), contents.size()),
+ SyscallSucceeds());
+ };
+ CheckTimes(file, fn, AtimeEffect::Unchanged, MtimeEffect::Changed,
+ CtimeEffect::Changed);
+}
+
+// Reading a file changes atime.
+TEST(StatTimesTest, FileRead) {
+ const std::string contents = "bills bills bills";
+ const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateFileWith(GetAbsoluteTestTmpdir(), contents, 0666));
+
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDONLY, 0));
+
+ auto fn = [&] {
+ char buf[20];
+ ASSERT_THAT(ReadFd(fd.get(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(contents.size()));
+ };
+ CheckTimes(file, fn, AtimeEffect::Changed, MtimeEffect::Unchanged,
+ CtimeEffect::Unchanged);
+}
+
+// Listing files in a directory changes atime.
+TEST(StatTimesTest, DirList) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const TempPath file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+
+ auto fn = [&] {
+ const auto contents = ASSERT_NO_ERRNO_AND_VALUE(ListDir(dir.path(), false));
+ EXPECT_THAT(contents, Not(IsEmpty()));
+ };
+ CheckTimes(dir, fn, AtimeEffect::Changed, MtimeEffect::Unchanged,
+ CtimeEffect::Unchanged);
+}
+
+// Creating a file in a directory changes mtime and ctime.
+TEST(StatTimesTest, DirCreateFile) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ TempPath file;
+ auto fn = [&] {
+ file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+ };
+ CheckTimes(dir, fn, AtimeEffect::Unchanged, MtimeEffect::Changed,
+ CtimeEffect::Changed);
+}
+
+// Creating a directory in a directory changes mtime and ctime.
+TEST(StatTimesTest, DirCreateDir) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ TempPath dir2;
+ auto fn = [&] {
+ dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(dir.path()));
+ };
+ CheckTimes(dir, fn, AtimeEffect::Unchanged, MtimeEffect::Changed,
+ CtimeEffect::Changed);
+}
+
+// Removing a file from a directory changes mtime and ctime.
+TEST(StatTimesTest, DirRemoveFile) {
+ const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+ auto fn = [&] { file.reset(); };
+ CheckTimes(dir, fn, AtimeEffect::Unchanged, MtimeEffect::Changed,
+ CtimeEffect::Changed);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/statfs.cc b/test/syscalls/linux/statfs.cc
new file mode 100644
index 000000000..aca51d30f
--- /dev/null
+++ b/test/syscalls/linux/statfs.cc
@@ -0,0 +1,82 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <sys/statfs.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(StatfsTest, CannotStatBadPath) {
+ auto temp_file = NewTempAbsPathInDir("/tmp");
+
+ struct statfs st;
+ EXPECT_THAT(statfs(temp_file.c_str(), &st), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(StatfsTest, InternalTmpfs) {
+ auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ struct statfs st;
+ EXPECT_THAT(statfs(temp_file.path().c_str(), &st), SyscallSucceeds());
+}
+
+TEST(StatfsTest, InternalDevShm) {
+ struct statfs st;
+ EXPECT_THAT(statfs("/dev/shm", &st), SyscallSucceeds());
+}
+
+TEST(StatfsTest, NameLen) {
+ struct statfs st;
+ EXPECT_THAT(statfs("/dev/shm", &st), SyscallSucceeds());
+
+ // This assumes that /dev/shm is tmpfs.
+ EXPECT_EQ(st.f_namelen, NAME_MAX);
+}
+
+TEST(FstatfsTest, CannotStatBadFd) {
+ struct statfs st;
+ EXPECT_THAT(fstatfs(-1, &st), SyscallFailsWithErrno(EBADF));
+}
+
+TEST(FstatfsTest, InternalTmpfs) {
+ auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file.path(), O_RDONLY));
+
+ struct statfs st;
+ EXPECT_THAT(fstatfs(fd.get(), &st), SyscallSucceeds());
+}
+
+TEST(FstatfsTest, InternalDevShm) {
+ auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/shm", O_RDONLY));
+
+ struct statfs st;
+ EXPECT_THAT(fstatfs(fd.get(), &st), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/sticky.cc b/test/syscalls/linux/sticky.cc
new file mode 100644
index 000000000..4afed6d08
--- /dev/null
+++ b/test/syscalls/linux/sticky.cc
@@ -0,0 +1,161 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <grp.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/flags/flag.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+ABSL_FLAG(int32_t, scratch_uid, 65534, "first scratch UID");
+ABSL_FLAG(int32_t, scratch_gid, 65534, "first scratch GID");
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(StickyTest, StickyBitPermDenied) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID)));
+
+ const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(chmod(parent.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds());
+
+ // After changing credentials below, we need to use an open fd to make
+ // modifications in the parent dir, because there is no guarantee that we will
+ // still have the ability to open it.
+ const FileDescriptor parent_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(parent.path(), O_DIRECTORY));
+ ASSERT_THAT(openat(parent_fd.get(), "file", O_CREAT), SyscallSucceeds());
+ ASSERT_THAT(mkdirat(parent_fd.get(), "dir", 0777), SyscallSucceeds());
+ ASSERT_THAT(symlinkat("xyz", parent_fd.get(), "link"), SyscallSucceeds());
+
+ // Drop privileges and change IDs only in child thread, or else this parent
+ // thread won't be able to open some log files after the test ends.
+ ScopedThread([&] {
+ // Drop privileges.
+ if (HaveCapability(CAP_FOWNER).ValueOrDie()) {
+ EXPECT_NO_ERRNO(SetCapability(CAP_FOWNER, false));
+ }
+
+ // Change EUID and EGID.
+ EXPECT_THAT(
+ syscall(SYS_setresgid, -1, absl::GetFlag(FLAGS_scratch_gid), -1),
+ SyscallSucceeds());
+ EXPECT_THAT(
+ syscall(SYS_setresuid, -1, absl::GetFlag(FLAGS_scratch_uid), -1),
+ SyscallSucceeds());
+
+ EXPECT_THAT(renameat(parent_fd.get(), "file", parent_fd.get(), "file2"),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(unlinkat(parent_fd.get(), "file", 0),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(unlinkat(parent_fd.get(), "dir", AT_REMOVEDIR),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(unlinkat(parent_fd.get(), "link", 0),
+ SyscallFailsWithErrno(EPERM));
+ });
+}
+
+TEST(StickyTest, StickyBitSameUID) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID)));
+
+ const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(chmod(parent.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds());
+
+ // After changing credentials below, we need to use an open fd to make
+ // modifications in the parent dir, because there is no guarantee that we will
+ // still have the ability to open it.
+ const FileDescriptor parent_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(parent.path(), O_DIRECTORY));
+ ASSERT_THAT(openat(parent_fd.get(), "file", O_CREAT), SyscallSucceeds());
+ ASSERT_THAT(mkdirat(parent_fd.get(), "dir", 0777), SyscallSucceeds());
+ ASSERT_THAT(symlinkat("xyz", parent_fd.get(), "link"), SyscallSucceeds());
+
+ // Drop privileges and change IDs only in child thread, or else this parent
+ // thread won't be able to open some log files after the test ends.
+ ScopedThread([&] {
+ // Drop privileges.
+ if (HaveCapability(CAP_FOWNER).ValueOrDie()) {
+ EXPECT_NO_ERRNO(SetCapability(CAP_FOWNER, false));
+ }
+
+ // Change EGID.
+ EXPECT_THAT(
+ syscall(SYS_setresgid, -1, absl::GetFlag(FLAGS_scratch_gid), -1),
+ SyscallSucceeds());
+
+ // We still have the same EUID.
+ EXPECT_THAT(renameat(parent_fd.get(), "file", parent_fd.get(), "file2"),
+ SyscallSucceeds());
+ EXPECT_THAT(unlinkat(parent_fd.get(), "file2", 0), SyscallSucceeds());
+ EXPECT_THAT(unlinkat(parent_fd.get(), "dir", AT_REMOVEDIR),
+ SyscallSucceeds());
+ EXPECT_THAT(unlinkat(parent_fd.get(), "link", 0), SyscallSucceeds());
+ });
+}
+
+TEST(StickyTest, StickyBitCapFOWNER) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID)));
+
+ const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(chmod(parent.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds());
+
+ // After changing credentials below, we need to use an open fd to make
+ // modifications in the parent dir, because there is no guarantee that we will
+ // still have the ability to open it.
+ const FileDescriptor parent_fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(parent.path(), O_DIRECTORY));
+ ASSERT_THAT(openat(parent_fd.get(), "file", O_CREAT), SyscallSucceeds());
+ ASSERT_THAT(mkdirat(parent_fd.get(), "dir", 0777), SyscallSucceeds());
+ ASSERT_THAT(symlinkat("xyz", parent_fd.get(), "link"), SyscallSucceeds());
+
+ // Drop privileges and change IDs only in child thread, or else this parent
+ // thread won't be able to open some log files after the test ends.
+ ScopedThread([&] {
+ // Set PR_SET_KEEPCAPS.
+ EXPECT_THAT(prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0), SyscallSucceeds());
+
+ // Change EUID and EGID.
+ EXPECT_THAT(
+ syscall(SYS_setresgid, -1, absl::GetFlag(FLAGS_scratch_gid), -1),
+ SyscallSucceeds());
+ EXPECT_THAT(
+ syscall(SYS_setresuid, -1, absl::GetFlag(FLAGS_scratch_uid), -1),
+ SyscallSucceeds());
+
+ EXPECT_NO_ERRNO(SetCapability(CAP_FOWNER, true));
+ EXPECT_THAT(renameat(parent_fd.get(), "file", parent_fd.get(), "file2"),
+ SyscallSucceeds());
+ EXPECT_THAT(unlinkat(parent_fd.get(), "file2", 0), SyscallSucceeds());
+ EXPECT_THAT(unlinkat(parent_fd.get(), "dir", AT_REMOVEDIR),
+ SyscallSucceeds());
+ EXPECT_THAT(unlinkat(parent_fd.get(), "link", 0), SyscallSucceeds());
+ });
+}
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc
new file mode 100644
index 000000000..a17ff62e9
--- /dev/null
+++ b/test/syscalls/linux/symlink.cc
@@ -0,0 +1,402 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+mode_t FilePermission(const std::string& path) {
+ struct stat buf = {0};
+ TEST_CHECK(lstat(path.c_str(), &buf) == 0);
+ return buf.st_mode & 0777;
+}
+
+// Test that name collisions are checked on the new link path, not the source
+// path. Regression test for b/31782115.
+TEST(SymlinkTest, CanCreateSymlinkWithCachedSourceDirent) {
+ const std::string srcname = NewTempAbsPath();
+ const std::string newname = NewTempAbsPath();
+ const std::string basedir = std::string(Dirname(srcname));
+ ASSERT_EQ(basedir, Dirname(newname));
+
+ ASSERT_THAT(chdir(basedir.c_str()), SyscallSucceeds());
+
+ // Open the source node to cause the underlying dirent to be cached. It will
+ // remain cached while we have the file open.
+ int fd;
+ ASSERT_THAT(fd = open(srcname.c_str(), O_CREAT | O_RDWR, 0666),
+ SyscallSucceeds());
+ FileDescriptor fd_closer(fd);
+
+ // Attempt to create a symlink. If the bug exists, this will fail since the
+ // dirent link creation code will check for a name collision on the source
+ // link name.
+ EXPECT_THAT(symlink(std::string(Basename(srcname)).c_str(),
+ std::string(Basename(newname)).c_str()),
+ SyscallSucceeds());
+}
+
+TEST(SymlinkTest, CanCreateSymlinkFile) {
+ const std::string oldname = NewTempAbsPath();
+ const std::string newname = NewTempAbsPath();
+
+ int fd;
+ ASSERT_THAT(fd = open(oldname.c_str(), O_CREAT | O_RDWR, 0666),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+
+ EXPECT_THAT(symlink(oldname.c_str(), newname.c_str()), SyscallSucceeds());
+ EXPECT_EQ(FilePermission(newname), 0777);
+
+ auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink(newname));
+ EXPECT_EQ(oldname, link);
+
+ EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds());
+ EXPECT_THAT(unlink(oldname.c_str()), SyscallSucceeds());
+}
+
+TEST(SymlinkTest, CanCreateSymlinkDir) {
+ const std::string olddir = NewTempAbsPath();
+ const std::string newdir = NewTempAbsPath();
+
+ EXPECT_THAT(mkdir(olddir.c_str(), 0777), SyscallSucceeds());
+ EXPECT_THAT(symlink(olddir.c_str(), newdir.c_str()), SyscallSucceeds());
+ EXPECT_EQ(FilePermission(newdir), 0777);
+
+ auto link = ASSERT_NO_ERRNO_AND_VALUE(ReadLink(newdir));
+ EXPECT_EQ(olddir, link);
+
+ EXPECT_THAT(unlink(newdir.c_str()), SyscallSucceeds());
+
+ ASSERT_THAT(rmdir(olddir.c_str()), SyscallSucceeds());
+}
+
+TEST(SymlinkTest, CannotCreateSymlinkInReadOnlyDir) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ const std::string olddir = NewTempAbsPath();
+ ASSERT_THAT(mkdir(olddir.c_str(), 0444), SyscallSucceeds());
+
+ const std::string newdir = NewTempAbsPathInDir(olddir);
+ EXPECT_THAT(symlink(olddir.c_str(), newdir.c_str()),
+ SyscallFailsWithErrno(EACCES));
+
+ ASSERT_THAT(rmdir(olddir.c_str()), SyscallSucceeds());
+}
+
+TEST(SymlinkTest, CannotSymlinkOverExistingFile) {
+ const auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const auto newfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ EXPECT_THAT(symlink(oldfile.path().c_str(), newfile.path().c_str()),
+ SyscallFailsWithErrno(EEXIST));
+}
+
+TEST(SymlinkTest, CannotSymlinkOverExistingDir) {
+ const auto oldfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const auto newdir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ EXPECT_THAT(symlink(oldfile.path().c_str(), newdir.path().c_str()),
+ SyscallFailsWithErrno(EEXIST));
+}
+
+TEST(SymlinkTest, OldnameIsEmpty) {
+ const std::string newname = NewTempAbsPath();
+ EXPECT_THAT(symlink("", newname.c_str()), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(SymlinkTest, OldnameIsDangling) {
+ const std::string newname = NewTempAbsPath();
+ EXPECT_THAT(symlink("/dangling", newname.c_str()), SyscallSucceeds());
+
+ // This is required for S/R random save tests, which pre-run this test
+ // in the same TEST_TMPDIR, which means that we need to clean it for any
+ // operations exclusively creating files, like symlink above.
+ EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds());
+}
+
+TEST(SymlinkTest, NewnameCannotExist) {
+ const std::string newname =
+ JoinPath(GetAbsoluteTestTmpdir(), "thisdoesnotexist", "foo");
+ EXPECT_THAT(symlink("/thisdoesnotmatter", newname.c_str()),
+ SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(SymlinkTest, CanEvaluateLink) {
+ const auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+
+ // We are going to assert that the symlink inode id is the same as the linked
+ // file's inode id. In order for the inode id to be stable across
+ // save/restore, it must be kept open. The FileDescriptor type will do that
+ // for us automatically.
+ auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR));
+ struct stat file_st;
+ EXPECT_THAT(fstat(fd.get(), &file_st), SyscallSucceeds());
+
+ const std::string link = NewTempAbsPath();
+ EXPECT_THAT(symlink(file.path().c_str(), link.c_str()), SyscallSucceeds());
+ EXPECT_EQ(FilePermission(link), 0777);
+
+ auto linkfd = ASSERT_NO_ERRNO_AND_VALUE(Open(link.c_str(), O_RDWR));
+ struct stat link_st;
+ EXPECT_THAT(fstat(linkfd.get(), &link_st), SyscallSucceeds());
+
+ // Check that in fact newname points to the file we expect.
+ EXPECT_EQ(file_st.st_dev, link_st.st_dev);
+ EXPECT_EQ(file_st.st_ino, link_st.st_ino);
+}
+
+TEST(SymlinkTest, TargetIsNotMapped) {
+ const std::string oldname = NewTempAbsPath();
+ const std::string newname = NewTempAbsPath();
+
+ int fd;
+ // Create the target so that when we read the link, it exists.
+ ASSERT_THAT(fd = open(oldname.c_str(), O_CREAT | O_RDWR, 0666),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+
+ // Create a symlink called newname that points to oldname.
+ EXPECT_THAT(symlink(oldname.c_str(), newname.c_str()), SyscallSucceeds());
+
+ std::vector<char> buf(1024);
+ int linksize;
+ // Read the link and assert that the oldname is still the same.
+ EXPECT_THAT(linksize = readlink(newname.c_str(), buf.data(), 1024),
+ SyscallSucceeds());
+ EXPECT_EQ(0, strncmp(oldname.c_str(), buf.data(), linksize));
+
+ EXPECT_THAT(unlink(newname.c_str()), SyscallSucceeds());
+ EXPECT_THAT(unlink(oldname.c_str()), SyscallSucceeds());
+}
+
+TEST(SymlinkTest, PreadFromSymlink) {
+ std::string name = NewTempAbsPath();
+ int fd;
+ ASSERT_THAT(fd = open(name.c_str(), O_CREAT, 0644), SyscallSucceeds());
+ ASSERT_THAT(close(fd), SyscallSucceeds());
+
+ std::string linkname = NewTempAbsPath();
+ ASSERT_THAT(symlink(name.c_str(), linkname.c_str()), SyscallSucceeds());
+
+ ASSERT_THAT(fd = open(linkname.c_str(), O_RDONLY), SyscallSucceeds());
+
+ char buf[1024];
+ EXPECT_THAT(pread64(fd, buf, 1024, 0), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+
+ EXPECT_THAT(unlink(name.c_str()), SyscallSucceeds());
+ EXPECT_THAT(unlink(linkname.c_str()), SyscallSucceeds());
+}
+
+TEST(SymlinkTest, SymlinkAtDegradedPermissions_NoRandomSave) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+
+ int dirfd;
+ ASSERT_THAT(dirfd = open(dir.path().c_str(), O_DIRECTORY, 0),
+ SyscallSucceeds());
+
+ const DisableSave ds; // Permissions are dropped.
+ EXPECT_THAT(fchmod(dirfd, 0), SyscallSucceeds());
+
+ std::string basename = std::string(Basename(file.path()));
+ EXPECT_THAT(symlinkat("/dangling", dirfd, basename.c_str()),
+ SyscallFailsWithErrno(EACCES));
+ EXPECT_THAT(close(dirfd), SyscallSucceeds());
+}
+
+TEST(SymlinkTest, ReadlinkAtDegradedPermissions_NoRandomSave) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const std::string oldpath = NewTempAbsPathInDir(dir.path());
+ const std::string oldbase = std::string(Basename(oldpath));
+ ASSERT_THAT(symlink("/dangling", oldpath.c_str()), SyscallSucceeds());
+
+ int dirfd;
+ EXPECT_THAT(dirfd = open(dir.path().c_str(), O_DIRECTORY, 0),
+ SyscallSucceeds());
+
+ const DisableSave ds; // Permissions are dropped.
+ EXPECT_THAT(fchmod(dirfd, 0), SyscallSucceeds());
+
+ char buf[1024];
+ int linksize;
+ EXPECT_THAT(linksize = readlinkat(dirfd, oldbase.c_str(), buf, 1024),
+ SyscallFailsWithErrno(EACCES));
+ EXPECT_THAT(close(dirfd), SyscallSucceeds());
+}
+
+TEST(SymlinkTest, ChmodSymlink) {
+ auto target = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const std::string newpath = NewTempAbsPath();
+ ASSERT_THAT(symlink(target.path().c_str(), newpath.c_str()),
+ SyscallSucceeds());
+ EXPECT_EQ(FilePermission(newpath), 0777);
+ EXPECT_THAT(chmod(newpath.c_str(), 0666), SyscallSucceeds());
+ EXPECT_EQ(FilePermission(newpath), 0777);
+}
+
+// Test that following a symlink updates the atime on the symlink.
+TEST(SymlinkTest, FollowUpdatesATime) {
+ const auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const std::string link = NewTempAbsPath();
+ EXPECT_THAT(symlink(file.path().c_str(), link.c_str()), SyscallSucceeds());
+
+ // Lstat the symlink.
+ struct stat st_before_follow;
+ ASSERT_THAT(lstat(link.c_str(), &st_before_follow), SyscallSucceeds());
+
+ // Let the clock advance.
+ absl::SleepFor(absl::Seconds(1));
+
+ // Open the file via the symlink.
+ int fd;
+ ASSERT_THAT(fd = open(link.c_str(), O_RDWR, 0666), SyscallSucceeds());
+ FileDescriptor fd_closer(fd);
+
+ // Lstat the symlink again, and check that atime is updated.
+ struct stat st_after_follow;
+ ASSERT_THAT(lstat(link.c_str(), &st_after_follow), SyscallSucceeds());
+ EXPECT_LT(st_before_follow.st_atime, st_after_follow.st_atime);
+}
+
+class ParamSymlinkTest : public ::testing::TestWithParam<std::string> {};
+
+// Test that creating an existing symlink with creat will create the target.
+TEST_P(ParamSymlinkTest, CreatLinkCreatesTarget) {
+ const std::string target = GetParam();
+ const std::string linkpath = NewTempAbsPath();
+
+ ASSERT_THAT(symlink(target.c_str(), linkpath.c_str()), SyscallSucceeds());
+
+ int fd;
+ EXPECT_THAT(fd = creat(linkpath.c_str(), 0666), SyscallSucceeds());
+ ASSERT_THAT(close(fd), SyscallSucceeds());
+
+ ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds());
+ struct stat st;
+ EXPECT_THAT(stat(target.c_str(), &st), SyscallSucceeds());
+
+ ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds());
+ ASSERT_THAT(unlink(target.c_str()), SyscallSucceeds());
+}
+
+// Test that opening an existing symlink with O_CREAT will create the target.
+TEST_P(ParamSymlinkTest, OpenLinkCreatesTarget) {
+ const std::string target = GetParam();
+ const std::string linkpath = NewTempAbsPath();
+
+ ASSERT_THAT(symlink(target.c_str(), linkpath.c_str()), SyscallSucceeds());
+
+ int fd;
+ EXPECT_THAT(fd = open(linkpath.c_str(), O_CREAT, 0666), SyscallSucceeds());
+ ASSERT_THAT(close(fd), SyscallSucceeds());
+
+ ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds());
+ struct stat st;
+ EXPECT_THAT(stat(target.c_str(), &st), SyscallSucceeds());
+
+ ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds());
+ ASSERT_THAT(unlink(target.c_str()), SyscallSucceeds());
+}
+
+// Test that opening a self-symlink with O_CREAT will fail with ELOOP.
+TEST_P(ParamSymlinkTest, CreateExistingSelfLink) {
+ ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds());
+
+ const std::string linkpath = GetParam();
+ ASSERT_THAT(symlink(linkpath.c_str(), linkpath.c_str()), SyscallSucceeds());
+
+ EXPECT_THAT(open(linkpath.c_str(), O_CREAT, 0666),
+ SyscallFailsWithErrno(ELOOP));
+
+ ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds());
+}
+
+// Test that opening a file that is a symlink to its parent directory fails
+// with ELOOP.
+TEST_P(ParamSymlinkTest, CreateExistingParentLink) {
+ ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds());
+
+ const std::string linkpath = GetParam();
+ const std::string target = JoinPath(linkpath, "child");
+ ASSERT_THAT(symlink(target.c_str(), linkpath.c_str()), SyscallSucceeds());
+
+ EXPECT_THAT(open(linkpath.c_str(), O_CREAT, 0666),
+ SyscallFailsWithErrno(ELOOP));
+
+ ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds());
+}
+
+// Test that opening an existing symlink with O_CREAT|O_EXCL will fail with
+// EEXIST.
+TEST_P(ParamSymlinkTest, OpenLinkExclFails) {
+ const std::string target = GetParam();
+ const std::string linkpath = NewTempAbsPath();
+
+ ASSERT_THAT(symlink(target.c_str(), linkpath.c_str()), SyscallSucceeds());
+
+ EXPECT_THAT(open(linkpath.c_str(), O_CREAT | O_EXCL, 0666),
+ SyscallFailsWithErrno(EEXIST));
+
+ ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds());
+}
+
+// Test that opening an existing symlink with O_CREAT|O_NOFOLLOW will fail with
+// ELOOP.
+TEST_P(ParamSymlinkTest, OpenLinkNoFollowFails) {
+ const std::string target = GetParam();
+ const std::string linkpath = NewTempAbsPath();
+
+ ASSERT_THAT(symlink(target.c_str(), linkpath.c_str()), SyscallSucceeds());
+
+ EXPECT_THAT(open(linkpath.c_str(), O_CREAT | O_NOFOLLOW, 0666),
+ SyscallFailsWithErrno(ELOOP));
+
+ ASSERT_THAT(unlink(linkpath.c_str()), SyscallSucceeds());
+}
+
+INSTANTIATE_TEST_SUITE_P(AbsAndRelTarget, ParamSymlinkTest,
+ ::testing::Values(NewTempAbsPath(), NewTempRelPath()));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/sync.cc b/test/syscalls/linux/sync.cc
new file mode 100644
index 000000000..8aa2525a9
--- /dev/null
+++ b/test/syscalls/linux/sync.cc
@@ -0,0 +1,59 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(SyncTest, SyncEverything) {
+ ASSERT_THAT(syscall(SYS_sync), SyscallSucceeds());
+}
+
+TEST(SyncTest, SyncFileSytem) {
+ int fd;
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ ASSERT_THAT(fd = open(f.path().c_str(), O_RDONLY), SyscallSucceeds());
+ EXPECT_THAT(syncfs(fd), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST(SyncTest, SyncFromPipe) {
+ int pipes[2];
+ EXPECT_THAT(pipe(pipes), SyscallSucceeds());
+ EXPECT_THAT(syncfs(pipes[0]), SyscallSucceeds());
+ EXPECT_THAT(syncfs(pipes[1]), SyscallSucceeds());
+ EXPECT_THAT(close(pipes[0]), SyscallSucceeds());
+ EXPECT_THAT(close(pipes[1]), SyscallSucceeds());
+}
+
+TEST(SyncTest, CannotSyncFileSytemAtBadFd) {
+ EXPECT_THAT(syncfs(-1), SyscallFailsWithErrno(EBADF));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/sync_file_range.cc b/test/syscalls/linux/sync_file_range.cc
new file mode 100644
index 000000000..36cc42043
--- /dev/null
+++ b/test/syscalls/linux/sync_file_range.cc
@@ -0,0 +1,112 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <string>
+
+#include "gtest/gtest.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(SyncFileRangeTest, TempFileSucceeds) {
+ auto tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfile.path(), O_RDWR));
+ constexpr char data[] = "some data to sync";
+ int fd = f.get();
+
+ EXPECT_THAT(write(fd, data, sizeof(data)),
+ SyscallSucceedsWithValue(sizeof(data)));
+ EXPECT_THAT(sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE),
+ SyscallSucceeds());
+ EXPECT_THAT(sync_file_range(fd, 0, 0, 0), SyscallSucceeds());
+ EXPECT_THAT(
+ sync_file_range(fd, 0, 0,
+ SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER |
+ SYNC_FILE_RANGE_WAIT_BEFORE),
+ SyscallSucceeds());
+ EXPECT_THAT(sync_file_range(
+ fd, 0, 1, SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER),
+ SyscallSucceeds());
+ EXPECT_THAT(sync_file_range(
+ fd, 1, 0, SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER),
+ SyscallSucceeds());
+}
+
+TEST(SyncFileRangeTest, CannotSyncFileRangeOnUnopenedFd) {
+ auto tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfile.path(), O_RDWR));
+ constexpr char data[] = "some data to sync";
+ int fd = f.get();
+
+ EXPECT_THAT(write(fd, data, sizeof(data)),
+ SyscallSucceedsWithValue(sizeof(data)));
+
+ pid_t pid = fork();
+ if (pid == 0) {
+ f.reset();
+
+ // fd is now invalid.
+ TEST_CHECK(sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE) == -1);
+ TEST_PCHECK(errno == EBADF);
+ _exit(0);
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+
+ int status = 0;
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFEXITED(status));
+ EXPECT_EQ(WEXITSTATUS(status), 0);
+}
+
+TEST(SyncFileRangeTest, BadArgs) {
+ auto tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfile.path(), O_RDWR));
+ int fd = f.get();
+
+ EXPECT_THAT(sync_file_range(fd, -1, 0, 0), SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(sync_file_range(fd, 0, -1, 0), SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(sync_file_range(fd, 8912, INT64_MAX - 4096, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(SyncFileRangeTest, CannotSyncFileRangeWithWaitBefore) {
+ auto tmpfile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(Open(tmpfile.path(), O_RDWR));
+ constexpr char data[] = "some data to sync";
+ int fd = f.get();
+
+ EXPECT_THAT(write(fd, data, sizeof(data)),
+ SyscallSucceedsWithValue(sizeof(data)));
+ if (IsRunningOnGvisor()) {
+ EXPECT_THAT(sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE),
+ SyscallFailsWithErrno(ENOSYS));
+ EXPECT_THAT(
+ sync_file_range(fd, 0, 0,
+ SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE),
+ SyscallFailsWithErrno(ENOSYS));
+ }
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/sysinfo.cc b/test/syscalls/linux/sysinfo.cc
new file mode 100644
index 000000000..1a71256da
--- /dev/null
+++ b/test/syscalls/linux/sysinfo.cc
@@ -0,0 +1,86 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This is a very simple sanity test to validate that the sysinfo syscall is
+// supported by gvisor and returns sane values.
+#include <sys/syscall.h>
+#include <sys/sysinfo.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(SysinfoTest, SysinfoIsCallable) {
+ struct sysinfo ignored = {};
+ EXPECT_THAT(syscall(SYS_sysinfo, &ignored), SyscallSucceedsWithValue(0));
+}
+
+TEST(SysinfoTest, EfaultProducedOnBadAddress) {
+ // Validate that we return EFAULT when a bad address is provided.
+ // specified by man 2 sysinfo
+ EXPECT_THAT(syscall(SYS_sysinfo, nullptr), SyscallFailsWithErrno(EFAULT));
+}
+
+TEST(SysinfoTest, TotalRamSaneValue) {
+ struct sysinfo s = {};
+ EXPECT_THAT(sysinfo(&s), SyscallSucceedsWithValue(0));
+ EXPECT_GT(s.totalram, 0);
+}
+
+TEST(SysinfoTest, MemunitSet) {
+ struct sysinfo s = {};
+ EXPECT_THAT(sysinfo(&s), SyscallSucceedsWithValue(0));
+ EXPECT_GE(s.mem_unit, 1);
+}
+
+TEST(SysinfoTest, UptimeSaneValue) {
+ struct sysinfo s = {};
+ EXPECT_THAT(sysinfo(&s), SyscallSucceedsWithValue(0));
+ EXPECT_GE(s.uptime, 0);
+}
+
+TEST(SysinfoTest, UptimeIncreasingValue) {
+ struct sysinfo s = {};
+ EXPECT_THAT(sysinfo(&s), SyscallSucceedsWithValue(0));
+ absl::SleepFor(absl::Seconds(2));
+ struct sysinfo s2 = {};
+ EXPECT_THAT(sysinfo(&s2), SyscallSucceedsWithValue(0));
+ EXPECT_LT(s.uptime, s2.uptime);
+}
+
+TEST(SysinfoTest, FreeRamSaneValue) {
+ struct sysinfo s = {};
+ EXPECT_THAT(sysinfo(&s), SyscallSucceedsWithValue(0));
+ EXPECT_GT(s.freeram, 0);
+ EXPECT_LT(s.freeram, s.totalram);
+}
+
+TEST(SysinfoTest, NumProcsSaneValue) {
+ struct sysinfo s = {};
+ EXPECT_THAT(sysinfo(&s), SyscallSucceedsWithValue(0));
+ EXPECT_GT(s.procs, 0);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/syslog.cc b/test/syscalls/linux/syslog.cc
new file mode 100644
index 000000000..9a7407d96
--- /dev/null
+++ b/test/syscalls/linux/syslog.cc
@@ -0,0 +1,51 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/klog.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr int SYSLOG_ACTION_READ_ALL = 3;
+constexpr int SYSLOG_ACTION_SIZE_BUFFER = 10;
+
+int Syslog(int type, char* buf, int len) {
+ return syscall(__NR_syslog, type, buf, len);
+}
+
+// Only SYSLOG_ACTION_SIZE_BUFFER and SYSLOG_ACTION_READ_ALL are implemented in
+// gVisor.
+
+TEST(Syslog, Size) {
+ EXPECT_THAT(Syslog(SYSLOG_ACTION_SIZE_BUFFER, nullptr, 0), SyscallSucceeds());
+}
+
+TEST(Syslog, ReadAll) {
+ // There might not be anything to read, so we can't check the write count.
+ char buf[100];
+ EXPECT_THAT(Syslog(SYSLOG_ACTION_READ_ALL, buf, sizeof(buf)),
+ SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/sysret.cc b/test/syscalls/linux/sysret.cc
new file mode 100644
index 000000000..19ffbd85b
--- /dev/null
+++ b/test/syscalls/linux/sysret.cc
@@ -0,0 +1,142 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Tests to verify that the behavior of linux and gvisor matches when
+// 'sysret' returns to bad (aka non-canonical) %rip or %rsp.
+
+#include <linux/elf.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+
+#include "gtest/gtest.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr uint64_t kNonCanonicalRip = 0xCCCC000000000000;
+constexpr uint64_t kNonCanonicalRsp = 0xFFFF000000000000;
+
+class SysretTest : public ::testing::Test {
+ protected:
+ struct user_regs_struct regs_;
+ struct iovec iov;
+ pid_t child_;
+
+ void SetUp() override {
+ pid_t pid = fork();
+
+ // Child.
+ if (pid == 0) {
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, 0, 0) == 0);
+ MaybeSave();
+ TEST_PCHECK(raise(SIGSTOP) == 0);
+ MaybeSave();
+ _exit(0);
+ }
+
+ // Parent.
+ int status;
+ memset(&iov, 0, sizeof(iov));
+ ASSERT_THAT(pid, SyscallSucceeds()); // Might still be < 0.
+ ASSERT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP);
+
+ iov.iov_base = &regs_;
+ iov.iov_len = sizeof(regs_);
+ ASSERT_THAT(ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+
+ child_ = pid;
+ }
+
+ void Detach() {
+ ASSERT_THAT(ptrace(PTRACE_DETACH, child_, 0, 0), SyscallSucceeds());
+ }
+
+ void SetRip(uint64_t newrip) {
+#if defined(__x86_64__)
+ regs_.rip = newrip;
+#elif defined(__aarch64__)
+ regs_.pc = newrip;
+#else
+#error "Unknown architecture"
+#endif
+ ASSERT_THAT(ptrace(PTRACE_SETREGSET, child_, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+ }
+
+ void SetRsp(uint64_t newrsp) {
+#if defined(__x86_64__)
+ regs_.rsp = newrsp;
+#elif defined(__aarch64__)
+ regs_.sp = newrsp;
+#else
+#error "Unknown architecture"
+#endif
+ ASSERT_THAT(ptrace(PTRACE_SETREGSET, child_, NT_PRSTATUS, &iov),
+ SyscallSucceeds());
+ }
+
+ // Wait waits for the child pid and returns the exit status.
+ int Wait() {
+ int status;
+ while (true) {
+ int rval = wait4(child_, &status, 0, NULL);
+ if (rval < 0) {
+ return rval;
+ }
+ if (rval == child_) {
+ return status;
+ }
+ }
+ }
+};
+
+TEST_F(SysretTest, JustDetach) {
+ Detach();
+ int status = Wait();
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+ << "status = " << status;
+}
+
+TEST_F(SysretTest, BadRip) {
+ SetRip(kNonCanonicalRip);
+ Detach();
+ int status = Wait();
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV)
+ << "status = " << status;
+}
+
+TEST_F(SysretTest, BadRsp) {
+ SetRsp(kNonCanonicalRsp);
+ Detach();
+ int status = Wait();
+#if defined(__x86_64__)
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGBUS)
+ << "status = " << status;
+#elif defined(__aarch64__)
+ EXPECT_TRUE(WIFSIGNALED(status) && WTERMSIG(status) == SIGSEGV)
+ << "status = " << status;
+#else
+#error "Unknown architecture"
+#endif
+}
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
new file mode 100644
index 000000000..a4d2953e1
--- /dev/null
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -0,0 +1,1568 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <limits>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+PosixErrorOr<sockaddr_storage> InetLoopbackAddr(int family) {
+ struct sockaddr_storage addr;
+ memset(&addr, 0, sizeof(addr));
+ addr.ss_family = family;
+ switch (family) {
+ case AF_INET:
+ reinterpret_cast<struct sockaddr_in*>(&addr)->sin_addr.s_addr =
+ htonl(INADDR_LOOPBACK);
+ break;
+ case AF_INET6:
+ reinterpret_cast<struct sockaddr_in6*>(&addr)->sin6_addr =
+ in6addr_loopback;
+ break;
+ default:
+ return PosixError(EINVAL,
+ absl::StrCat("unknown socket family: ", family));
+ }
+ return addr;
+}
+
+// Fixture for tests parameterized by the address family to use (AF_INET and
+// AF_INET6) when creating sockets.
+class TcpSocketTest : public ::testing::TestWithParam<int> {
+ protected:
+ // Creates three sockets that will be used by test cases -- a listener, one
+ // that connects, and the accepted one.
+ void SetUp() override;
+
+ // Closes the sockets created by SetUp().
+ void TearDown() override;
+
+ // Listening socket.
+ int listener_ = -1;
+
+ // Socket connected via connect().
+ int s_ = -1;
+
+ // Socket connected via accept().
+ int t_ = -1;
+
+ // Initial size of the send buffer.
+ int sendbuf_size_ = -1;
+};
+
+void TcpSocketTest::SetUp() {
+ ASSERT_THAT(listener_ = socket(GetParam(), SOCK_STREAM, IPPROTO_TCP),
+ SyscallSucceeds());
+
+ ASSERT_THAT(s_ = socket(GetParam(), SOCK_STREAM, IPPROTO_TCP),
+ SyscallSucceeds());
+
+ // Initialize address to the loopback one.
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t addrlen = sizeof(addr);
+
+ // Bind to some port then start listening.
+ ASSERT_THAT(
+ bind(listener_, reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(listener_, SOMAXCONN), SyscallSucceeds());
+
+ // Get the address we're listening on, then connect to it. We need to do this
+ // because we're allowing the stack to pick a port for us.
+ ASSERT_THAT(getsockname(listener_, reinterpret_cast<struct sockaddr*>(&addr),
+ &addrlen),
+ SyscallSucceeds());
+
+ ASSERT_THAT(RetryEINTR(connect)(s_, reinterpret_cast<struct sockaddr*>(&addr),
+ addrlen),
+ SyscallSucceeds());
+
+ // Get the initial send buffer size.
+ socklen_t optlen = sizeof(sendbuf_size_);
+ ASSERT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &sendbuf_size_, &optlen),
+ SyscallSucceeds());
+
+ // Accept the connection.
+ ASSERT_THAT(t_ = RetryEINTR(accept)(listener_, nullptr, nullptr),
+ SyscallSucceeds());
+}
+
+void TcpSocketTest::TearDown() {
+ EXPECT_THAT(close(listener_), SyscallSucceeds());
+ if (s_ >= 0) {
+ EXPECT_THAT(close(s_), SyscallSucceeds());
+ }
+ if (t_ >= 0) {
+ EXPECT_THAT(close(t_), SyscallSucceeds());
+ }
+}
+
+TEST_P(TcpSocketTest, ConnectOnEstablishedConnection) {
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t addrlen = sizeof(addr);
+
+ ASSERT_THAT(
+ connect(s_, reinterpret_cast<const struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(EISCONN));
+ ASSERT_THAT(
+ connect(t_, reinterpret_cast<const struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(EISCONN));
+}
+
+TEST_P(TcpSocketTest, ShutdownWriteInTimeWait) {
+ EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+ EXPECT_THAT(shutdown(s_, SHUT_RDWR), SyscallSucceeds());
+ absl::SleepFor(absl::Seconds(1)); // Wait to enter TIME_WAIT.
+ EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(TcpSocketTest, ShutdownWriteInFinWait1) {
+ EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+ EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+ absl::SleepFor(absl::Seconds(1)); // Wait to enter FIN-WAIT2.
+ EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceeds());
+}
+
+TEST_P(TcpSocketTest, DataCoalesced) {
+ char buf[10];
+
+ // Write in two steps.
+ ASSERT_THAT(RetryEINTR(write)(s_, buf, sizeof(buf) / 2),
+ SyscallSucceedsWithValue(sizeof(buf) / 2));
+ ASSERT_THAT(RetryEINTR(write)(s_, buf, sizeof(buf) / 2),
+ SyscallSucceedsWithValue(sizeof(buf) / 2));
+
+ // Allow stack to process both packets.
+ absl::SleepFor(absl::Seconds(1));
+
+ // Read in one shot.
+ EXPECT_THAT(RetryEINTR(recv)(t_, buf, sizeof(buf), 0),
+ SyscallSucceedsWithValue(sizeof(buf)));
+}
+
+TEST_P(TcpSocketTest, SenderAddressIgnored) {
+ char buf[3];
+ ASSERT_THAT(RetryEINTR(write)(s_, buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ memset(&addr, 0, sizeof(addr));
+
+ ASSERT_THAT(
+ RetryEINTR(recvfrom)(t_, buf, sizeof(buf), 0,
+ reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+ SyscallSucceedsWithValue(3));
+
+ // Check that addr remains zeroed-out.
+ const char* ptr = reinterpret_cast<char*>(&addr);
+ for (size_t i = 0; i < sizeof(addr); i++) {
+ EXPECT_EQ(ptr[i], 0);
+ }
+}
+
+TEST_P(TcpSocketTest, SenderAddressIgnoredOnPeek) {
+ char buf[3];
+ ASSERT_THAT(RetryEINTR(write)(s_, buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ memset(&addr, 0, sizeof(addr));
+
+ ASSERT_THAT(
+ RetryEINTR(recvfrom)(t_, buf, sizeof(buf), MSG_PEEK,
+ reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+ SyscallSucceedsWithValue(3));
+
+ // Check that addr remains zeroed-out.
+ const char* ptr = reinterpret_cast<char*>(&addr);
+ for (size_t i = 0; i < sizeof(addr); i++) {
+ EXPECT_EQ(ptr[i], 0);
+ }
+}
+
+TEST_P(TcpSocketTest, SendtoAddressIgnored) {
+ struct sockaddr_storage addr;
+ memset(&addr, 0, sizeof(addr));
+ addr.ss_family = GetParam(); // FIXME(b/63803955)
+
+ char data = '\0';
+ EXPECT_THAT(
+ RetryEINTR(sendto)(s_, &data, sizeof(data), 0,
+ reinterpret_cast<sockaddr*>(&addr), sizeof(addr)),
+ SyscallSucceedsWithValue(1));
+}
+
+TEST_P(TcpSocketTest, WritevZeroIovec) {
+ // 2 bytes just to be safe and have vecs[1] not point to something random
+ // (even though length is 0).
+ char buf[2];
+ char recv_buf[1];
+
+ // Construct a vec where the final vector is of length 0.
+ iovec vecs[2] = {};
+ vecs[0].iov_base = buf;
+ vecs[0].iov_len = 1;
+ vecs[1].iov_base = buf + 1;
+ vecs[1].iov_len = 0;
+
+ EXPECT_THAT(RetryEINTR(writev)(s_, vecs, 2), SyscallSucceedsWithValue(1));
+
+ EXPECT_THAT(RetryEINTR(recv)(t_, recv_buf, 1, 0),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ(memcmp(recv_buf, buf, 1), 0);
+}
+
+TEST_P(TcpSocketTest, ZeroWriteAllowed) {
+ char buf[3];
+ // Send a zero length packet.
+ ASSERT_THAT(RetryEINTR(write)(s_, buf, 0), SyscallSucceedsWithValue(0));
+ // Verify that there is no packet available.
+ EXPECT_THAT(RetryEINTR(recv)(t_, buf, sizeof(buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+// Test that a non-blocking write with a buffer that is larger than the send
+// buffer size will not actually write the whole thing at once. Regression test
+// for b/64438887.
+TEST_P(TcpSocketTest, NonblockingLargeWrite) {
+ // Set the FD to O_NONBLOCK.
+ int opts;
+ ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
+ opts |= O_NONBLOCK;
+ ASSERT_THAT(fcntl(s_, F_SETFL, opts), SyscallSucceeds());
+
+ // Allocate a buffer three times the size of the send buffer. We do this with
+ // a vector to avoid allocating on the stack.
+ int size = 3 * sendbuf_size_;
+ std::vector<char> buf(size);
+
+ // Try to write the whole thing.
+ int n;
+ ASSERT_THAT(n = RetryEINTR(write)(s_, buf.data(), size), SyscallSucceeds());
+
+ // We should have written something, but not the whole thing.
+ EXPECT_GT(n, 0);
+ EXPECT_LT(n, size);
+}
+
+// Test that a blocking write with a buffer that is larger than the send buffer
+// will block until the entire buffer is sent.
+TEST_P(TcpSocketTest, BlockingLargeWrite_NoRandomSave) {
+ // Allocate a buffer three times the size of the send buffer on the heap. We
+ // do this as a vector to avoid allocating on the stack.
+ int size = 3 * sendbuf_size_;
+ std::vector<char> writebuf(size);
+
+ // Start reading the response in a loop.
+ int read_bytes = 0;
+ ScopedThread t([this, &read_bytes]() {
+ // Avoid interrupting the blocking write in main thread.
+ const DisableSave ds;
+
+ // Take ownership of the FD so that we close it on failure. This will
+ // unblock the blocking write below.
+ FileDescriptor fd(t_);
+ t_ = -1;
+
+ char readbuf[2500] = {};
+ int n = -1;
+ while (n != 0) {
+ ASSERT_THAT(n = RetryEINTR(read)(fd.get(), &readbuf, sizeof(readbuf)),
+ SyscallSucceeds());
+ read_bytes += n;
+ }
+ });
+
+ // Try to write the whole thing.
+ int n;
+ ASSERT_THAT(n = WriteFd(s_, writebuf.data(), size), SyscallSucceeds());
+
+ // We should have written the whole thing.
+ EXPECT_EQ(n, size);
+ EXPECT_THAT(close(s_), SyscallSucceedsWithValue(0));
+ s_ = -1;
+ t.Join();
+
+ // We should have read the whole thing.
+ EXPECT_EQ(read_bytes, size);
+}
+
+// Test that a send with MSG_DONTWAIT flag and buffer that larger than the send
+// buffer size will not write the whole thing.
+TEST_P(TcpSocketTest, LargeSendDontWait) {
+ // Allocate a buffer three times the size of the send buffer. We do this on
+ // with a vector to avoid allocating on the stack.
+ int size = 3 * sendbuf_size_;
+ std::vector<char> buf(size);
+
+ // Try to write the whole thing with MSG_DONTWAIT flag, which can
+ // return a partial write.
+ int n;
+ ASSERT_THAT(n = RetryEINTR(send)(s_, buf.data(), size, MSG_DONTWAIT),
+ SyscallSucceeds());
+
+ // We should have written something, but not the whole thing.
+ EXPECT_GT(n, 0);
+ EXPECT_LT(n, size);
+}
+
+// Test that a send on a non-blocking socket with a buffer that larger than the
+// send buffer will not write the whole thing at once.
+TEST_P(TcpSocketTest, NonblockingLargeSend) {
+ // Set the FD to O_NONBLOCK.
+ int opts;
+ ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
+ opts |= O_NONBLOCK;
+ ASSERT_THAT(fcntl(s_, F_SETFL, opts), SyscallSucceeds());
+
+ // Allocate a buffer three times the size of the send buffer. We do this on
+ // with a vector to avoid allocating on the stack.
+ int size = 3 * sendbuf_size_;
+ std::vector<char> buf(size);
+
+ // Try to write the whole thing.
+ int n;
+ ASSERT_THAT(n = RetryEINTR(send)(s_, buf.data(), size, 0), SyscallSucceeds());
+
+ // We should have written something, but not the whole thing.
+ EXPECT_GT(n, 0);
+ EXPECT_LT(n, size);
+}
+
+// Same test as above, but calls send instead of write.
+TEST_P(TcpSocketTest, BlockingLargeSend_NoRandomSave) {
+ // Allocate a buffer three times the size of the send buffer. We do this on
+ // with a vector to avoid allocating on the stack.
+ int size = 3 * sendbuf_size_;
+ std::vector<char> writebuf(size);
+
+ // Start reading the response in a loop.
+ int read_bytes = 0;
+ ScopedThread t([this, &read_bytes]() {
+ // Avoid interrupting the blocking write in main thread.
+ const DisableSave ds;
+
+ // Take ownership of the FD so that we close it on failure. This will
+ // unblock the blocking write below.
+ FileDescriptor fd(t_);
+ t_ = -1;
+
+ char readbuf[2500] = {};
+ int n = -1;
+ while (n != 0) {
+ ASSERT_THAT(n = RetryEINTR(read)(fd.get(), &readbuf, sizeof(readbuf)),
+ SyscallSucceeds());
+ read_bytes += n;
+ }
+ });
+
+ // Try to send the whole thing.
+ int n;
+ ASSERT_THAT(n = SendFd(s_, writebuf.data(), size, 0), SyscallSucceeds());
+
+ // We should have written the whole thing.
+ EXPECT_EQ(n, size);
+ EXPECT_THAT(close(s_), SyscallSucceedsWithValue(0));
+ s_ = -1;
+ t.Join();
+
+ // We should have read the whole thing.
+ EXPECT_EQ(read_bytes, size);
+}
+
+// Test that polling on a socket with a full send buffer will block.
+TEST_P(TcpSocketTest, PollWithFullBufferBlocks) {
+ // Set the FD to O_NONBLOCK.
+ int opts;
+ ASSERT_THAT(opts = fcntl(s_, F_GETFL), SyscallSucceeds());
+ opts |= O_NONBLOCK;
+ ASSERT_THAT(fcntl(s_, F_SETFL, opts), SyscallSucceeds());
+
+ // Set TCP_NODELAY, which will cause linux to fill the receive buffer from the
+ // send buffer as quickly as possibly. This way we can fill up both buffers
+ // faster.
+ constexpr int tcp_nodelay_flag = 1;
+ ASSERT_THAT(setsockopt(s_, IPPROTO_TCP, TCP_NODELAY, &tcp_nodelay_flag,
+ sizeof(tcp_nodelay_flag)),
+ SyscallSucceeds());
+
+ // Set a 256KB send/receive buffer.
+ int buf_sz = 1 << 18;
+ EXPECT_THAT(setsockopt(t_, SOL_SOCKET, SO_RCVBUF, &buf_sz, sizeof(buf_sz)),
+ SyscallSucceedsWithValue(0));
+ EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &buf_sz, sizeof(buf_sz)),
+ SyscallSucceedsWithValue(0));
+
+ // Create a large buffer that will be used for sending.
+ std::vector<char> buf(1 << 16);
+
+ // Write until we receive an error.
+ while (RetryEINTR(send)(s_, buf.data(), buf.size(), 0) != -1) {
+ // Sleep to give linux a chance to move data from the send buffer to the
+ // receive buffer.
+ usleep(10000); // 10ms.
+ }
+ // The last error should have been EWOULDBLOCK.
+ ASSERT_EQ(errno, EWOULDBLOCK);
+
+ // Now polling on the FD with a timeout should return 0 corresponding to no
+ // FDs ready.
+ struct pollfd poll_fd = {s_, POLLOUT, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10), SyscallSucceedsWithValue(0));
+}
+
+TEST_P(TcpSocketTest, MsgTrunc) {
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(RetryEINTR(send)(s_, sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ char received_data[sizeof(sent_data)] = {};
+ ASSERT_THAT(
+ RetryEINTR(recv)(t_, received_data, sizeof(received_data) / 2, MSG_TRUNC),
+ SyscallSucceedsWithValue(sizeof(sent_data) / 2));
+
+ // Check that we didn't get anything.
+ char zeros[sizeof(received_data)] = {};
+ EXPECT_EQ(0, memcmp(zeros, received_data, sizeof(received_data)));
+}
+
+// MSG_CTRUNC is a return flag but linux allows it to be set on input flags
+// without returning an error.
+TEST_P(TcpSocketTest, MsgTruncWithCtrunc) {
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(RetryEINTR(send)(s_, sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ char received_data[sizeof(sent_data)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(t_, received_data, sizeof(received_data) / 2,
+ MSG_TRUNC | MSG_CTRUNC),
+ SyscallSucceedsWithValue(sizeof(sent_data) / 2));
+
+ // Check that we didn't get anything.
+ char zeros[sizeof(received_data)] = {};
+ EXPECT_EQ(0, memcmp(zeros, received_data, sizeof(received_data)));
+}
+
+// This test will verify that MSG_CTRUNC doesn't do anything when specified
+// on input.
+TEST_P(TcpSocketTest, MsgTruncWithCtruncOnly) {
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(RetryEINTR(send)(s_, sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ char received_data[sizeof(sent_data)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(t_, received_data, sizeof(received_data) / 2,
+ MSG_CTRUNC),
+ SyscallSucceedsWithValue(sizeof(sent_data) / 2));
+
+ // Since MSG_CTRUNC here had no affect, it should not behave like MSG_TRUNC.
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data) / 2));
+}
+
+TEST_P(TcpSocketTest, MsgTruncLargeSize) {
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(RetryEINTR(send)(s_, sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ char received_data[sizeof(sent_data) * 2] = {};
+ ASSERT_THAT(
+ RetryEINTR(recv)(t_, received_data, sizeof(received_data), MSG_TRUNC),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ // Check that we didn't get anything.
+ char zeros[sizeof(received_data)] = {};
+ EXPECT_EQ(0, memcmp(zeros, received_data, sizeof(received_data)));
+}
+
+TEST_P(TcpSocketTest, MsgTruncPeek) {
+ char sent_data[512];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(RetryEINTR(send)(s_, sent_data, sizeof(sent_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ char received_data[sizeof(sent_data)] = {};
+ ASSERT_THAT(RetryEINTR(recv)(t_, received_data, sizeof(received_data) / 2,
+ MSG_TRUNC | MSG_PEEK),
+ SyscallSucceedsWithValue(sizeof(sent_data) / 2));
+
+ // Check that we didn't get anything.
+ char zeros[sizeof(received_data)] = {};
+ EXPECT_EQ(0, memcmp(zeros, received_data, sizeof(received_data)));
+
+ // Check that we can still get all of the data.
+ ASSERT_THAT(RetryEINTR(recv)(t_, received_data, sizeof(received_data), 0),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(TcpSocketTest, NoDelayDefault) {
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(getsockopt(s_, IPPROTO_TCP, TCP_NODELAY, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+TEST_P(TcpSocketTest, SetNoDelay) {
+ ASSERT_THAT(
+ setsockopt(s_, IPPROTO_TCP, TCP_NODELAY, &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ EXPECT_THAT(getsockopt(s_, IPPROTO_TCP, TCP_NODELAY, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOn);
+
+ ASSERT_THAT(setsockopt(s_, IPPROTO_TCP, TCP_NODELAY, &kSockOptOff,
+ sizeof(kSockOptOff)),
+ SyscallSucceeds());
+
+ EXPECT_THAT(getsockopt(s_, IPPROTO_TCP, TCP_NODELAY, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kSockOptOff);
+}
+
+#ifndef TCP_INQ
+#define TCP_INQ 36
+#endif
+
+TEST_P(TcpSocketTest, TcpInqSetSockOpt) {
+ char buf[1024];
+ ASSERT_THAT(RetryEINTR(write)(s_, buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // TCP_INQ is disabled by default.
+ int val = -1;
+ socklen_t slen = sizeof(val);
+ EXPECT_THAT(getsockopt(t_, SOL_TCP, TCP_INQ, &val, &slen),
+ SyscallSucceedsWithValue(0));
+ ASSERT_EQ(val, 0);
+
+ // Try to set TCP_INQ.
+ val = 1;
+ EXPECT_THAT(setsockopt(t_, SOL_TCP, TCP_INQ, &val, sizeof(val)),
+ SyscallSucceedsWithValue(0));
+ val = -1;
+ slen = sizeof(val);
+ EXPECT_THAT(getsockopt(t_, SOL_TCP, TCP_INQ, &val, &slen),
+ SyscallSucceedsWithValue(0));
+ ASSERT_EQ(val, 1);
+
+ // Try to unset TCP_INQ.
+ val = 0;
+ EXPECT_THAT(setsockopt(t_, SOL_TCP, TCP_INQ, &val, sizeof(val)),
+ SyscallSucceedsWithValue(0));
+ val = -1;
+ slen = sizeof(val);
+ EXPECT_THAT(getsockopt(t_, SOL_TCP, TCP_INQ, &val, &slen),
+ SyscallSucceedsWithValue(0));
+ ASSERT_EQ(val, 0);
+}
+
+TEST_P(TcpSocketTest, TcpInq) {
+ char buf[1024];
+ // Write more than one TCP segment.
+ int size = sizeof(buf);
+ int kChunk = sizeof(buf) / 4;
+ for (int i = 0; i < size; i += kChunk) {
+ ASSERT_THAT(RetryEINTR(write)(s_, buf, kChunk),
+ SyscallSucceedsWithValue(kChunk));
+ }
+
+ int val = 1;
+ kChunk = sizeof(buf) / 2;
+ EXPECT_THAT(setsockopt(t_, SOL_TCP, TCP_INQ, &val, sizeof(val)),
+ SyscallSucceedsWithValue(0));
+
+ // Wait when all data will be in the received queue.
+ while (true) {
+ ASSERT_THAT(ioctl(t_, TIOCINQ, &size), SyscallSucceeds());
+ if (size == sizeof(buf)) {
+ break;
+ }
+ absl::SleepFor(absl::Milliseconds(10));
+ }
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(sizeof(int)));
+ size = sizeof(buf);
+ struct iovec iov;
+ for (int i = 0; size != 0; i += kChunk) {
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ iov.iov_base = buf;
+ iov.iov_len = kChunk;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ ASSERT_THAT(RetryEINTR(recvmsg)(t_, &msg, 0),
+ SyscallSucceedsWithValue(kChunk));
+ size -= kChunk;
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ ASSERT_EQ(cmsg->cmsg_level, SOL_TCP);
+ ASSERT_EQ(cmsg->cmsg_type, TCP_INQ);
+
+ int inq = 0;
+ memcpy(&inq, CMSG_DATA(cmsg), sizeof(int));
+ ASSERT_EQ(inq, size);
+ }
+}
+
+TEST_P(TcpSocketTest, Tiocinq) {
+ char buf[1024];
+ size_t size = sizeof(buf);
+ ASSERT_THAT(RetryEINTR(write)(s_, buf, size), SyscallSucceedsWithValue(size));
+
+ uint32_t seed = time(nullptr);
+ const size_t max_chunk = size / 10;
+ while (size > 0) {
+ size_t chunk = (rand_r(&seed) % max_chunk) + 1;
+ ssize_t read = RetryEINTR(recvfrom)(t_, buf, chunk, 0, nullptr, nullptr);
+ ASSERT_THAT(read, SyscallSucceeds());
+ size -= read;
+
+ int inq = 0;
+ ASSERT_THAT(ioctl(t_, TIOCINQ, &inq), SyscallSucceeds());
+ ASSERT_EQ(inq, size);
+ }
+}
+
+TEST_P(TcpSocketTest, TcpSCMPriority) {
+ char buf[1024];
+ ASSERT_THAT(RetryEINTR(write)(s_, buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ int val = 1;
+ EXPECT_THAT(setsockopt(t_, SOL_TCP, TCP_INQ, &val, sizeof(val)),
+ SyscallSucceedsWithValue(0));
+ EXPECT_THAT(setsockopt(t_, SOL_SOCKET, SO_TIMESTAMP, &val, sizeof(val)),
+ SyscallSucceedsWithValue(0));
+
+ struct msghdr msg = {};
+ std::vector<char> control(
+ CMSG_SPACE(sizeof(struct timeval) + CMSG_SPACE(sizeof(int))));
+ struct iovec iov;
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ iov.iov_base = buf;
+ iov.iov_len = sizeof(buf);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ ASSERT_THAT(RetryEINTR(recvmsg)(t_, &msg, 0),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ // TODO(b/78348848): SO_TIMESTAMP isn't implemented for TCP sockets.
+ if (!IsRunningOnGvisor() || cmsg->cmsg_level == SOL_SOCKET) {
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SO_TIMESTAMP);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval)));
+
+ cmsg = CMSG_NXTHDR(&msg, cmsg);
+ ASSERT_NE(cmsg, nullptr);
+ }
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ ASSERT_EQ(cmsg->cmsg_level, SOL_TCP);
+ ASSERT_EQ(cmsg->cmsg_type, TCP_INQ);
+
+ int inq = 0;
+ memcpy(&inq, CMSG_DATA(cmsg), sizeof(int));
+ ASSERT_EQ(inq, 0);
+
+ cmsg = CMSG_NXTHDR(&msg, cmsg);
+ ASSERT_EQ(cmsg, nullptr);
+}
+
+INSTANTIATE_TEST_SUITE_P(AllInetTests, TcpSocketTest,
+ ::testing::Values(AF_INET, AF_INET6));
+
+// Fixture for tests parameterized by address family that don't want the fixture
+// to do things.
+using SimpleTcpSocketTest = ::testing::TestWithParam<int>;
+
+TEST_P(SimpleTcpSocketTest, SendUnconnected) {
+ int fd;
+ ASSERT_THAT(fd = socket(GetParam(), SOCK_STREAM, IPPROTO_TCP),
+ SyscallSucceeds());
+ FileDescriptor sock_fd(fd);
+
+ char data = '\0';
+ EXPECT_THAT(RetryEINTR(send)(fd, &data, sizeof(data), 0),
+ SyscallFailsWithErrno(EPIPE));
+}
+
+TEST_P(SimpleTcpSocketTest, SendtoWithoutAddressUnconnected) {
+ int fd;
+ ASSERT_THAT(fd = socket(GetParam(), SOCK_STREAM, IPPROTO_TCP),
+ SyscallSucceeds());
+ FileDescriptor sock_fd(fd);
+
+ char data = '\0';
+ EXPECT_THAT(RetryEINTR(sendto)(fd, &data, sizeof(data), 0, nullptr, 0),
+ SyscallFailsWithErrno(EPIPE));
+}
+
+TEST_P(SimpleTcpSocketTest, SendtoWithAddressUnconnected) {
+ int fd;
+ ASSERT_THAT(fd = socket(GetParam(), SOCK_STREAM, IPPROTO_TCP),
+ SyscallSucceeds());
+ FileDescriptor sock_fd(fd);
+
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ char data = '\0';
+ EXPECT_THAT(
+ RetryEINTR(sendto)(fd, &data, sizeof(data), 0,
+ reinterpret_cast<sockaddr*>(&addr), sizeof(addr)),
+ SyscallFailsWithErrno(EPIPE));
+}
+
+TEST_P(SimpleTcpSocketTest, GetPeerNameUnconnected) {
+ int fd;
+ ASSERT_THAT(fd = socket(GetParam(), SOCK_STREAM, IPPROTO_TCP),
+ SyscallSucceeds());
+ FileDescriptor sock_fd(fd);
+
+ sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(getpeername(fd, reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(TcpSocketTest, FullBuffer) {
+ // Set both FDs to be blocking.
+ int flags = 0;
+ ASSERT_THAT(flags = fcntl(s_, F_GETFL), SyscallSucceeds());
+ EXPECT_THAT(fcntl(s_, F_SETFL, flags & ~O_NONBLOCK), SyscallSucceeds());
+ flags = 0;
+ ASSERT_THAT(flags = fcntl(t_, F_GETFL), SyscallSucceeds());
+ EXPECT_THAT(fcntl(t_, F_SETFL, flags & ~O_NONBLOCK), SyscallSucceeds());
+
+ // 2500 was chosen as a small value that can be set on Linux.
+ int set_snd = 2500;
+ EXPECT_THAT(setsockopt(s_, SOL_SOCKET, SO_SNDBUF, &set_snd, sizeof(set_snd)),
+ SyscallSucceedsWithValue(0));
+ int get_snd = -1;
+ socklen_t get_snd_len = sizeof(get_snd);
+ EXPECT_THAT(getsockopt(s_, SOL_SOCKET, SO_SNDBUF, &get_snd, &get_snd_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_snd_len, sizeof(get_snd));
+ EXPECT_GT(get_snd, 0);
+
+ // 2500 was chosen as a small value that can be set on Linux and gVisor.
+ int set_rcv = 2500;
+ EXPECT_THAT(setsockopt(t_, SOL_SOCKET, SO_RCVBUF, &set_rcv, sizeof(set_rcv)),
+ SyscallSucceedsWithValue(0));
+ int get_rcv = -1;
+ socklen_t get_rcv_len = sizeof(get_rcv);
+ EXPECT_THAT(getsockopt(t_, SOL_SOCKET, SO_RCVBUF, &get_rcv, &get_rcv_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_rcv_len, sizeof(get_rcv));
+ EXPECT_GE(get_rcv, 2500);
+
+ // Quick sanity test.
+ EXPECT_LT(get_snd + get_rcv, 2500 * IOV_MAX);
+
+ char data[2500] = {};
+ std::vector<struct iovec> iovecs;
+ for (int i = 0; i < IOV_MAX; i++) {
+ struct iovec iov = {};
+ iov.iov_base = data;
+ iov.iov_len = sizeof(data);
+ iovecs.push_back(iov);
+ }
+ ScopedThread t([this, &iovecs]() {
+ int result = -1;
+ EXPECT_THAT(result = RetryEINTR(writev)(s_, iovecs.data(), iovecs.size()),
+ SyscallSucceeds());
+ EXPECT_GT(result, 1);
+ EXPECT_LT(result, sizeof(data) * iovecs.size());
+ });
+
+ char recv = 0;
+ EXPECT_THAT(RetryEINTR(read)(t_, &recv, 1), SyscallSucceedsWithValue(1));
+ EXPECT_THAT(close(t_), SyscallSucceedsWithValue(0));
+ t_ = -1;
+}
+
+TEST_P(TcpSocketTest, PollAfterShutdown) {
+ ScopedThread client_thread([this]() {
+ EXPECT_THAT(shutdown(s_, SHUT_WR), SyscallSucceedsWithValue(0));
+ struct pollfd poll_fd = {s_, POLLIN | POLLERR | POLLHUP, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+ SyscallSucceedsWithValue(1));
+ });
+
+ EXPECT_THAT(shutdown(t_, SHUT_WR), SyscallSucceedsWithValue(0));
+ struct pollfd poll_fd = {t_, POLLIN | POLLERR | POLLHUP, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+ SyscallSucceedsWithValue(1));
+}
+
+TEST_P(SimpleTcpSocketTest, NonBlockingConnectNoListener) {
+ // Initialize address to the loopback one.
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t addrlen = sizeof(addr);
+
+ const FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ // Set the FD to O_NONBLOCK.
+ int opts;
+ ASSERT_THAT(opts = fcntl(s.get(), F_GETFL), SyscallSucceeds());
+ opts |= O_NONBLOCK;
+ ASSERT_THAT(fcntl(s.get(), F_SETFL, opts), SyscallSucceeds());
+
+ ASSERT_THAT(RetryEINTR(connect)(
+ s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(EINPROGRESS));
+
+ // Now polling on the FD with a timeout should return 0 corresponding to no
+ // FDs ready.
+ struct pollfd poll_fd = {s.get(), POLLOUT, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+ SyscallSucceedsWithValue(1));
+
+ int err;
+ socklen_t optlen = sizeof(err);
+ ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_ERROR, &err, &optlen),
+ SyscallSucceeds());
+
+ EXPECT_EQ(err, ECONNREFUSED);
+}
+
+TEST_P(SimpleTcpSocketTest, NonBlockingConnect) {
+ const FileDescriptor listener =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ // Initialize address to the loopback one.
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t addrlen = sizeof(addr);
+
+ // Bind to some port then start listening.
+ ASSERT_THAT(
+ bind(listener.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(listener.get(), SOMAXCONN), SyscallSucceeds());
+
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ // Set the FD to O_NONBLOCK.
+ int opts;
+ ASSERT_THAT(opts = fcntl(s.get(), F_GETFL), SyscallSucceeds());
+ opts |= O_NONBLOCK;
+ ASSERT_THAT(fcntl(s.get(), F_SETFL, opts), SyscallSucceeds());
+
+ ASSERT_THAT(getsockname(listener.get(),
+ reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ ASSERT_THAT(RetryEINTR(connect)(
+ s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(EINPROGRESS));
+
+ int t;
+ ASSERT_THAT(t = RetryEINTR(accept)(listener.get(), nullptr, nullptr),
+ SyscallSucceeds());
+
+ // Now polling on the FD with a timeout should return 0 corresponding to no
+ // FDs ready.
+ struct pollfd poll_fd = {s.get(), POLLOUT, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+ SyscallSucceedsWithValue(1));
+
+ int err;
+ socklen_t optlen = sizeof(err);
+ ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_ERROR, &err, &optlen),
+ SyscallSucceeds());
+
+ EXPECT_EQ(err, 0);
+
+ EXPECT_THAT(close(t), SyscallSucceeds());
+}
+
+TEST_P(SimpleTcpSocketTest, NonBlockingConnectRemoteClose) {
+ const FileDescriptor listener =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ // Initialize address to the loopback one.
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t addrlen = sizeof(addr);
+
+ // Bind to some port then start listening.
+ ASSERT_THAT(
+ bind(listener.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+
+ ASSERT_THAT(listen(listener.get(), SOMAXCONN), SyscallSucceeds());
+
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(GetParam(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+
+ ASSERT_THAT(getsockname(listener.get(),
+ reinterpret_cast<struct sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ ASSERT_THAT(RetryEINTR(connect)(
+ s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(EINPROGRESS));
+
+ int t;
+ ASSERT_THAT(t = RetryEINTR(accept)(listener.get(), nullptr, nullptr),
+ SyscallSucceeds());
+
+ EXPECT_THAT(close(t), SyscallSucceeds());
+
+ // Now polling on the FD with a timeout should return 0 corresponding to no
+ // FDs ready.
+ struct pollfd poll_fd = {s.get(), POLLOUT, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 10000),
+ SyscallSucceedsWithValue(1));
+
+ ASSERT_THAT(RetryEINTR(connect)(
+ s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallSucceeds());
+
+ ASSERT_THAT(RetryEINTR(connect)(
+ s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(EISCONN));
+}
+
+// Test that we get an ECONNREFUSED with a blocking socket when no one is
+// listening on the other end.
+TEST_P(SimpleTcpSocketTest, BlockingConnectRefused) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ // Initialize address to the loopback one.
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t addrlen = sizeof(addr);
+
+ ASSERT_THAT(RetryEINTR(connect)(
+ s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(ECONNREFUSED));
+
+ // Avoiding triggering save in destructor of s.
+ EXPECT_THAT(close(s.release()), SyscallSucceeds());
+}
+
+// Test that connecting to a non-listening port and thus receiving a RST is
+// handled appropriately by the socket - the port that the socket was bound to
+// is released and the expected error is returned.
+TEST_P(SimpleTcpSocketTest, CleanupOnConnectionRefused) {
+ // Create a socket that is known to not be listening. As is it bound but not
+ // listening, when another socket connects to the port, it will refuse..
+ FileDescriptor bound_s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ sockaddr_storage bound_addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t bound_addrlen = sizeof(bound_addr);
+
+ ASSERT_THAT(
+ bind(bound_s.get(), reinterpret_cast<struct sockaddr*>(&bound_addr),
+ bound_addrlen),
+ SyscallSucceeds());
+
+ // Get the addresses the socket is bound to because the port is chosen by the
+ // stack.
+ ASSERT_THAT(getsockname(bound_s.get(),
+ reinterpret_cast<struct sockaddr*>(&bound_addr),
+ &bound_addrlen),
+ SyscallSucceeds());
+
+ // Create, initialize, and bind the socket that is used to test connecting to
+ // the non-listening port.
+ FileDescriptor client_s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ // Initialize client address to the loopback one.
+ sockaddr_storage client_addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t client_addrlen = sizeof(client_addr);
+
+ ASSERT_THAT(
+ bind(client_s.get(), reinterpret_cast<struct sockaddr*>(&client_addr),
+ client_addrlen),
+ SyscallSucceeds());
+
+ ASSERT_THAT(getsockname(client_s.get(),
+ reinterpret_cast<struct sockaddr*>(&client_addr),
+ &client_addrlen),
+ SyscallSucceeds());
+
+ // Now the test: connect to the bound but not listening socket with the
+ // client socket. The bound socket should return a RST and cause the client
+ // socket to return an error and clean itself up immediately.
+ // The error being ECONNREFUSED diverges with RFC 793, page 37, but does what
+ // Linux does.
+ ASSERT_THAT(connect(client_s.get(),
+ reinterpret_cast<const struct sockaddr*>(&bound_addr),
+ bound_addrlen),
+ SyscallFailsWithErrno(ECONNREFUSED));
+
+ FileDescriptor new_s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ // Test binding to the address from the client socket. This should be okay
+ // if it was dropped correctly.
+ ASSERT_THAT(
+ bind(new_s.get(), reinterpret_cast<struct sockaddr*>(&client_addr),
+ client_addrlen),
+ SyscallSucceeds());
+
+ // Attempt #2, with the new socket and reused addr our connect should fail in
+ // the same way as before, not with an EADDRINUSE.
+ ASSERT_THAT(connect(client_s.get(),
+ reinterpret_cast<const struct sockaddr*>(&bound_addr),
+ bound_addrlen),
+ SyscallFailsWithErrno(ECONNREFUSED));
+}
+
+// Test that we get an ECONNREFUSED with a nonblocking socket.
+TEST_P(SimpleTcpSocketTest, NonBlockingConnectRefused) {
+ FileDescriptor s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(GetParam(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+
+ // Initialize address to the loopback one.
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t addrlen = sizeof(addr);
+
+ ASSERT_THAT(RetryEINTR(connect)(
+ s.get(), reinterpret_cast<struct sockaddr*>(&addr), addrlen),
+ SyscallFailsWithErrno(EINPROGRESS));
+
+ // We don't need to specify any events to get POLLHUP or POLLERR as these
+ // are added before the poll.
+ struct pollfd poll_fd = {s.get(), /*events=*/0, 0};
+ EXPECT_THAT(RetryEINTR(poll)(&poll_fd, 1, 1000), SyscallSucceedsWithValue(1));
+
+ // The ECONNREFUSED should cause us to be woken up with POLLHUP.
+ EXPECT_NE(poll_fd.revents & (POLLHUP | POLLERR), 0);
+
+ // Avoiding triggering save in destructor of s.
+ EXPECT_THAT(close(s.release()), SyscallSucceeds());
+}
+
+// Test that setting a supported congestion control algorithm succeeds for an
+// unconnected TCP socket
+TEST_P(SimpleTcpSocketTest, SetCongestionControlSucceedsForSupported) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ {
+ const char kSetCC[kTcpCaNameMax] = "reno";
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &kSetCC,
+ strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax];
+ memset(got_cc, '1', sizeof(got_cc));
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ // We ignore optlen here as the linux kernel sets optlen to the lower of the
+ // size of the buffer passed in or kTcpCaNameMax and not the length of the
+ // congestion control algorithm's actual name.
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kTcpCaNameMax)));
+ }
+ {
+ const char kSetCC[kTcpCaNameMax] = "cubic";
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &kSetCC,
+ strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax];
+ memset(got_cc, '1', sizeof(got_cc));
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ // We ignore optlen here as the linux kernel sets optlen to the lower of the
+ // size of the buffer passed in or kTcpCaNameMax and not the length of the
+ // congestion control algorithm's actual name.
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kTcpCaNameMax)));
+ }
+}
+
+// This test verifies that a getsockopt(...TCP_CONGESTION) behaviour is
+// consistent between linux and gvisor when the passed in buffer is smaller than
+// kTcpCaNameMax.
+TEST_P(SimpleTcpSocketTest, SetGetTCPCongestionShortReadBuffer) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ {
+ // Verify that getsockopt/setsockopt work with buffers smaller than
+ // kTcpCaNameMax.
+ const char kSetCC[] = "cubic";
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &kSetCC,
+ strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[sizeof(kSetCC)];
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(sizeof(got_cc), optlen);
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(got_cc)));
+ }
+}
+
+// This test verifies that a getsockopt(...TCP_CONGESTION) behaviour is
+// consistent between linux and gvisor when the passed in buffer is larger than
+// kTcpCaNameMax.
+TEST_P(SimpleTcpSocketTest, SetGetTCPCongestionLargeReadBuffer) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ {
+ // Verify that getsockopt works with buffers larger than
+ // kTcpCaNameMax.
+ const char kSetCC[] = "cubic";
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &kSetCC,
+ strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax + 5];
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ // Linux copies the minimum of kTcpCaNameMax or the length of the passed in
+ // buffer and sets optlen to the number of bytes actually copied
+ // irrespective of the actual length of the congestion control name.
+ EXPECT_EQ(kTcpCaNameMax, optlen);
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kSetCC)));
+ }
+}
+
+// Test that setting an unsupported congestion control algorithm fails for an
+// unconnected TCP socket.
+TEST_P(SimpleTcpSocketTest, SetCongestionControlFailsForUnsupported) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ char old_cc[kTcpCaNameMax];
+ socklen_t optlen = sizeof(old_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &old_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+
+ const char kSetCC[] = "invalid_ca_kSetCC";
+ ASSERT_THAT(
+ setsockopt(s.get(), SOL_TCP, TCP_CONGESTION, &kSetCC, strlen(kSetCC)),
+ SyscallFailsWithErrno(ENOENT));
+
+ char got_cc[kTcpCaNameMax];
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ // We ignore optlen here as the linux kernel sets optlen to the lower of the
+ // size of the buffer passed in or kTcpCaNameMax and not the length of the
+ // congestion control algorithm's actual name.
+ EXPECT_EQ(0, memcmp(got_cc, old_cc, sizeof(kTcpCaNameMax)));
+}
+
+TEST_P(SimpleTcpSocketTest, MaxSegDefault) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ constexpr int kDefaultMSS = 536;
+ int tcp_max_seg;
+ socklen_t optlen = sizeof(tcp_max_seg);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_MAXSEG, &tcp_max_seg, &optlen),
+ SyscallSucceedsWithValue(0));
+
+ EXPECT_EQ(kDefaultMSS, tcp_max_seg);
+ EXPECT_EQ(sizeof(tcp_max_seg), optlen);
+}
+
+TEST_P(SimpleTcpSocketTest, SetMaxSeg) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ constexpr int kDefaultMSS = 536;
+ constexpr int kTCPMaxSeg = 1024;
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_MAXSEG, &kTCPMaxSeg,
+ sizeof(kTCPMaxSeg)),
+ SyscallSucceedsWithValue(0));
+
+ // Linux actually never returns the user_mss value. It will always return the
+ // default MSS value defined above for an unconnected socket and always return
+ // the actual current MSS for a connected one.
+ int optval;
+ socklen_t optlen = sizeof(optval);
+ ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_MAXSEG, &optval, &optlen),
+ SyscallSucceedsWithValue(0));
+
+ EXPECT_EQ(kDefaultMSS, optval);
+ EXPECT_EQ(sizeof(optval), optlen);
+}
+
+TEST_P(SimpleTcpSocketTest, SetMaxSegFailsForInvalidMSSValues) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ {
+ constexpr int tcp_max_seg = 10;
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_MAXSEG, &tcp_max_seg,
+ sizeof(tcp_max_seg)),
+ SyscallFailsWithErrno(EINVAL));
+ }
+ {
+ constexpr int tcp_max_seg = 75000;
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_MAXSEG, &tcp_max_seg,
+ sizeof(tcp_max_seg)),
+ SyscallFailsWithErrno(EINVAL));
+ }
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPUserTimeout) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ {
+ constexpr int kTCPUserTimeout = -1;
+ EXPECT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &kTCPUserTimeout, sizeof(kTCPUserTimeout)),
+ SyscallFailsWithErrno(EINVAL));
+ }
+
+ // kTCPUserTimeout is in milliseconds.
+ constexpr int kTCPUserTimeout = 100;
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT,
+ &kTCPUserTimeout, sizeof(kTCPUserTimeout)),
+ SyscallSucceedsWithValue(0));
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_USER_TIMEOUT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kTCPUserTimeout);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptNeg) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ // -ve TCP_DEFER_ACCEPT is same as setting it to zero.
+ constexpr int kNeg = -1;
+ EXPECT_THAT(
+ setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &kNeg, sizeof(kNeg)),
+ SyscallSucceeds());
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, GetTCPDeferAcceptDefault) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, 0);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPDeferAcceptGreaterThanZero) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ // kTCPDeferAccept is in seconds.
+ // NOTE: linux translates seconds to # of retries and back from
+ // #of retries to seconds. Which means only certain values
+ // translate back exactly. That's why we use 3 here, a value of
+ // 5 will result in us getting back 7 instead of 5 in the
+ // getsockopt.
+ constexpr int kTCPDeferAccept = 3;
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT,
+ &kTCPDeferAccept, sizeof(kTCPDeferAccept)),
+ SyscallSucceeds());
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_DEFER_ACCEPT, &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kTCPDeferAccept);
+}
+
+TEST_P(SimpleTcpSocketTest, RecvOnClosedSocket) {
+ auto s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ char buf[1];
+ EXPECT_THAT(recv(s.get(), buf, 0, 0), SyscallFailsWithErrno(ENOTCONN));
+ EXPECT_THAT(recv(s.get(), buf, sizeof(buf), 0),
+ SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(SimpleTcpSocketTest, TCPConnectSoRcvBufRace) {
+ auto s = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(GetParam(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+ sockaddr_storage addr =
+ ASSERT_NO_ERRNO_AND_VALUE(InetLoopbackAddr(GetParam()));
+ socklen_t addrlen = sizeof(addr);
+
+ RetryEINTR(connect)(s.get(), reinterpret_cast<struct sockaddr*>(&addr),
+ addrlen);
+ int buf_sz = 1 << 18;
+ EXPECT_THAT(
+ setsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &buf_sz, sizeof(buf_sz)),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPSynCntLessThanOne) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ int default_syn_cnt = get;
+
+ {
+ // TCP_SYNCNT less than 1 should be rejected with an EINVAL.
+ constexpr int kZero = 0;
+ EXPECT_THAT(
+ setsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &kZero, sizeof(kZero)),
+ SyscallFailsWithErrno(EINVAL));
+
+ // TCP_SYNCNT less than 1 should be rejected with an EINVAL.
+ constexpr int kNeg = -1;
+ EXPECT_THAT(
+ setsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &kNeg, sizeof(kNeg)),
+ SyscallFailsWithErrno(EINVAL));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+
+ ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(default_syn_cnt, get);
+ }
+}
+
+TEST_P(SimpleTcpSocketTest, GetTCPSynCntDefault) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ constexpr int kDefaultSynCnt = 6;
+
+ ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kDefaultSynCnt);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPSynCntGreaterThanOne) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ constexpr int kTCPSynCnt = 20;
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &kTCPSynCnt,
+ sizeof(kTCPSynCnt)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kTCPSynCnt);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPSynCntAboveMax) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ int default_syn_cnt = get;
+ {
+ constexpr int kTCPSynCnt = 256;
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &kTCPSynCnt,
+ sizeof(kTCPSynCnt)),
+ SyscallFailsWithErrno(EINVAL));
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(s.get(), IPPROTO_TCP, TCP_SYNCNT, &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, default_syn_cnt);
+ }
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPWindowClampBelowMinRcvBuf) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ // Discover minimum receive buf by setting a really low value
+ // for the receive buffer.
+ constexpr int kZero = 0;
+ EXPECT_THAT(setsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &kZero, sizeof(kZero)),
+ SyscallSucceeds());
+
+ // Now retrieve the minimum value for SO_RCVBUF as the set above should
+ // have caused SO_RCVBUF for the socket to be set to the minimum.
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ int min_so_rcvbuf = get;
+
+ {
+ // TCP_WINDOW_CLAMP less than min_so_rcvbuf/2 should be set to
+ // min_so_rcvbuf/2.
+ int below_half_min_rcvbuf = min_so_rcvbuf / 2 - 1;
+ EXPECT_THAT(
+ setsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP,
+ &below_half_min_rcvbuf, sizeof(below_half_min_rcvbuf)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(min_so_rcvbuf / 2, get);
+ }
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPWindowClampZeroClosedSocket) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ constexpr int kZero = 0;
+ ASSERT_THAT(
+ setsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, &kZero, sizeof(kZero)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, &get, &get_len),
+ SyscallSucceeds());
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(get, kZero);
+}
+
+TEST_P(SimpleTcpSocketTest, SetTCPWindowClampAboveHalfMinRcvBuf) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+
+ // Discover minimum receive buf by setting a really low value
+ // for the receive buffer.
+ constexpr int kZero = 0;
+ EXPECT_THAT(setsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &kZero, sizeof(kZero)),
+ SyscallSucceeds());
+
+ // Now retrieve the minimum value for SO_RCVBUF as the set above should
+ // have caused SO_RCVBUF for the socket to be set to the minimum.
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+ ASSERT_THAT(getsockopt(s.get(), SOL_SOCKET, SO_RCVBUF, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ int min_so_rcvbuf = get;
+
+ {
+ int above_half_min_rcv_buf = min_so_rcvbuf / 2 + 1;
+ EXPECT_THAT(
+ setsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP,
+ &above_half_min_rcv_buf, sizeof(above_half_min_rcv_buf)),
+ SyscallSucceeds());
+
+ int get = -1;
+ socklen_t get_len = sizeof(get);
+
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_WINDOW_CLAMP, &get, &get_len),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(get_len, sizeof(get));
+ EXPECT_EQ(above_half_min_rcv_buf, get);
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
+ ::testing::Values(AF_INET, AF_INET6));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/tgkill.cc b/test/syscalls/linux/tgkill.cc
new file mode 100644
index 000000000..80acae5de
--- /dev/null
+++ b/test/syscalls/linux/tgkill.cc
@@ -0,0 +1,48 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(TgkillTest, InvalidTID) {
+ EXPECT_THAT(tgkill(getpid(), -1, 0), SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(tgkill(getpid(), 0, 0), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(TgkillTest, InvalidTGID) {
+ EXPECT_THAT(tgkill(-1, gettid(), 0), SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(tgkill(0, gettid(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(TgkillTest, ValidInput) {
+ EXPECT_THAT(tgkill(getpid(), gettid(), 0), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/time.cc b/test/syscalls/linux/time.cc
new file mode 100644
index 000000000..e75bba669
--- /dev/null
+++ b/test/syscalls/linux/time.cc
@@ -0,0 +1,107 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <time.h>
+
+#include "gtest/gtest.h"
+#include "test/util/proc_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+constexpr long kFudgeSeconds = 5;
+
+#if defined(__x86_64__) || defined(__i386__)
+// Mimics the time(2) wrapper from glibc prior to 2.15.
+time_t vsyscall_time(time_t* t) {
+ constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
+ return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
+}
+
+TEST(TimeTest, VsyscallTime_Succeeds) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled()));
+
+ time_t t1, t2;
+
+ {
+ const DisableSave ds; // Timing assertions.
+ EXPECT_THAT(time(&t1), SyscallSucceeds());
+ EXPECT_THAT(vsyscall_time(&t2), SyscallSucceeds());
+ }
+
+ // Time should be monotonic.
+ EXPECT_LE(static_cast<long>(t1), static_cast<long>(t2));
+
+ // Check that it's within kFudge seconds.
+ EXPECT_LE(static_cast<long>(t2), static_cast<long>(t1) + kFudgeSeconds);
+
+ // Redo with save.
+ EXPECT_THAT(time(&t1), SyscallSucceeds());
+ EXPECT_THAT(vsyscall_time(&t2), SyscallSucceeds());
+
+ // Time should be monotonic.
+ EXPECT_LE(static_cast<long>(t1), static_cast<long>(t2));
+}
+
+TEST(TimeTest, VsyscallTime_InvalidAddressSIGSEGV) {
+ EXPECT_EXIT(vsyscall_time(reinterpret_cast<time_t*>(0x1)),
+ ::testing::KilledBySignal(SIGSEGV), "");
+}
+
+// Mimics the gettimeofday(2) wrapper from the Go runtime <= 1.2.
+int vsyscall_gettimeofday(struct timeval* tv, struct timezone* tz) {
+ constexpr uint64_t kVsyscallGettimeofdayEntry = 0xffffffffff600000;
+ return reinterpret_cast<int (*)(struct timeval*, struct timezone*)>(
+ kVsyscallGettimeofdayEntry)(tv, tz);
+}
+
+TEST(TimeTest, VsyscallGettimeofday_Succeeds) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled()));
+
+ struct timeval tv1, tv2;
+ struct timezone tz1, tz2;
+
+ {
+ const DisableSave ds; // Timing assertions.
+ EXPECT_THAT(gettimeofday(&tv1, &tz1), SyscallSucceeds());
+ EXPECT_THAT(vsyscall_gettimeofday(&tv2, &tz2), SyscallSucceeds());
+ }
+
+ // See above.
+ EXPECT_LE(static_cast<long>(tv1.tv_sec), static_cast<long>(tv2.tv_sec));
+ EXPECT_LE(static_cast<long>(tv2.tv_sec),
+ static_cast<long>(tv1.tv_sec) + kFudgeSeconds);
+
+ // Redo with save.
+ EXPECT_THAT(gettimeofday(&tv1, &tz1), SyscallSucceeds());
+ EXPECT_THAT(vsyscall_gettimeofday(&tv2, &tz2), SyscallSucceeds());
+}
+
+TEST(TimeTest, VsyscallGettimeofday_InvalidAddressSIGSEGV) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled()));
+
+ EXPECT_EXIT(vsyscall_gettimeofday(reinterpret_cast<struct timeval*>(0x1),
+ reinterpret_cast<struct timezone*>(0x1)),
+ ::testing::KilledBySignal(SIGSEGV), "");
+}
+#endif
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/timerfd.cc b/test/syscalls/linux/timerfd.cc
new file mode 100644
index 000000000..c4f8fdd7a
--- /dev/null
+++ b/test/syscalls/linux/timerfd.cc
@@ -0,0 +1,273 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <poll.h>
+#include <sys/timerfd.h>
+#include <time.h>
+
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Wrapper around timerfd_create(2) that returns a FileDescriptor.
+PosixErrorOr<FileDescriptor> TimerfdCreate(int clockid, int flags) {
+ int fd = timerfd_create(clockid, flags);
+ MaybeSave();
+ if (fd < 0) {
+ return PosixError(errno, "timerfd_create failed");
+ }
+ return FileDescriptor(fd);
+}
+
+// In tests that race a timerfd with a sleep, some slack is required because:
+//
+// - Timerfd expirations are asynchronous with respect to nanosleeps.
+//
+// - Because clock_gettime(CLOCK_MONOTONIC) is implemented through the VDSO,
+// it technically uses a closely-related, but distinct, time domain from the
+// CLOCK_MONOTONIC used to trigger timerfd expirations. The same applies to
+// CLOCK_BOOTTIME which is an alias for CLOCK_MONOTONIC.
+absl::Duration TimerSlack() { return absl::Milliseconds(500); }
+
+class TimerfdTest : public ::testing::TestWithParam<int> {};
+
+TEST_P(TimerfdTest, IsInitiallyStopped) {
+ auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), 0));
+ struct itimerspec its = {};
+ ASSERT_THAT(timerfd_gettime(tfd.get(), &its), SyscallSucceeds());
+ EXPECT_EQ(0, its.it_value.tv_sec);
+ EXPECT_EQ(0, its.it_value.tv_nsec);
+}
+
+TEST_P(TimerfdTest, SingleShot) {
+ constexpr absl::Duration kDelay = absl::Seconds(1);
+
+ auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), 0));
+ struct itimerspec its = {};
+ its.it_value = absl::ToTimespec(kDelay);
+ ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr),
+ SyscallSucceeds());
+
+ // The timer should fire exactly once since the interval is zero.
+ absl::SleepFor(kDelay + TimerSlack());
+ uint64_t val = 0;
+ ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+ SyscallSucceedsWithValue(sizeof(uint64_t)));
+ EXPECT_EQ(1, val);
+}
+
+TEST_P(TimerfdTest, Periodic) {
+ constexpr absl::Duration kDelay = absl::Seconds(1);
+ constexpr int kPeriods = 3;
+
+ auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), 0));
+ struct itimerspec its = {};
+ its.it_value = absl::ToTimespec(kDelay);
+ its.it_interval = absl::ToTimespec(kDelay);
+ ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr),
+ SyscallSucceeds());
+
+ // Expect to see at least kPeriods expirations. More may occur due to the
+ // timer slack, or due to delays from scheduling or save/restore.
+ absl::SleepFor(kPeriods * kDelay + TimerSlack());
+ uint64_t val = 0;
+ ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+ SyscallSucceedsWithValue(sizeof(uint64_t)));
+ EXPECT_GE(val, kPeriods);
+}
+
+TEST_P(TimerfdTest, BlockingRead) {
+ constexpr absl::Duration kDelay = absl::Seconds(3);
+
+ auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), 0));
+ struct itimerspec its = {};
+ its.it_value.tv_sec = absl::ToInt64Seconds(kDelay);
+ auto const start_time = absl::Now();
+ ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr),
+ SyscallSucceeds());
+
+ // read should block until the timer fires.
+ uint64_t val = 0;
+ ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+ SyscallSucceedsWithValue(sizeof(uint64_t)));
+ auto const end_time = absl::Now();
+ EXPECT_EQ(1, val);
+ EXPECT_GE((end_time - start_time) + TimerSlack(), kDelay);
+}
+
+TEST_P(TimerfdTest, NonblockingRead_NoRandomSave) {
+ constexpr absl::Duration kDelay = absl::Seconds(5);
+
+ auto const tfd =
+ ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), TFD_NONBLOCK));
+
+ // Since the timer is initially disabled and has never fired, read should
+ // return EAGAIN.
+ uint64_t val = 0;
+ ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+ SyscallFailsWithErrno(EAGAIN));
+
+ DisableSave ds; // Timing-sensitive.
+
+ // Arm the timer.
+ struct itimerspec its = {};
+ its.it_value.tv_sec = absl::ToInt64Seconds(kDelay);
+ ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr),
+ SyscallSucceeds());
+
+ // Since the timer has not yet fired, read should return EAGAIN.
+ ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+ SyscallFailsWithErrno(EAGAIN));
+
+ ds.reset(); // No longer timing-sensitive.
+
+ // After the timer fires, read should indicate 1 expiration.
+ absl::SleepFor(kDelay + TimerSlack());
+ ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+ SyscallSucceedsWithValue(sizeof(uint64_t)));
+ EXPECT_EQ(1, val);
+
+ // The successful read should have reset the number of expirations.
+ ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(TimerfdTest, BlockingPoll_SetTimeResetsExpirations) {
+ constexpr absl::Duration kDelay = absl::Seconds(3);
+
+ auto const tfd =
+ ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), TFD_NONBLOCK));
+ struct itimerspec its = {};
+ its.it_value.tv_sec = absl::ToInt64Seconds(kDelay);
+ auto const start_time = absl::Now();
+ ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr),
+ SyscallSucceeds());
+
+ // poll should block until the timer fires.
+ struct pollfd pfd = {};
+ pfd.fd = tfd.get();
+ pfd.events = POLLIN;
+ ASSERT_THAT(poll(&pfd, /* nfds = */ 1,
+ /* timeout = */ 2 * absl::ToInt64Seconds(kDelay) * 1000),
+ SyscallSucceedsWithValue(1));
+ auto const end_time = absl::Now();
+ EXPECT_EQ(POLLIN, pfd.revents);
+ EXPECT_GE((end_time - start_time) + TimerSlack(), kDelay);
+
+ // Call timerfd_settime again with a value of 0. This should reset the number
+ // of expirations to 0, causing read to return EAGAIN since the timerfd is
+ // non-blocking.
+ its.it_value.tv_sec = 0;
+ ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr),
+ SyscallSucceeds());
+ uint64_t val = 0;
+ ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(TimerfdTest, SetAbsoluteTime) {
+ constexpr absl::Duration kDelay = absl::Seconds(3);
+
+ // Use a non-blocking timerfd so that if TFD_TIMER_ABSTIME is incorrectly
+ // non-functional, we get EAGAIN rather than a test timeout.
+ auto const tfd =
+ ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), TFD_NONBLOCK));
+ struct itimerspec its = {};
+ ASSERT_THAT(clock_gettime(GetParam(), &its.it_value), SyscallSucceeds());
+ its.it_value.tv_sec += absl::ToInt64Seconds(kDelay);
+ ASSERT_THAT(timerfd_settime(tfd.get(), TFD_TIMER_ABSTIME, &its, nullptr),
+ SyscallSucceeds());
+
+ absl::SleepFor(kDelay + TimerSlack());
+ uint64_t val = 0;
+ ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+ SyscallSucceedsWithValue(sizeof(uint64_t)));
+ EXPECT_EQ(1, val);
+}
+
+TEST_P(TimerfdTest, IllegalSeek) {
+ auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), 0));
+ if (!IsRunningWithVFS1()) {
+ EXPECT_THAT(lseek(tfd.get(), 0, SEEK_SET), SyscallFailsWithErrno(ESPIPE));
+ }
+}
+
+TEST_P(TimerfdTest, IllegalPread) {
+ auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), 0));
+ int val;
+ EXPECT_THAT(pread(tfd.get(), &val, sizeof(val), 0),
+ SyscallFailsWithErrno(ESPIPE));
+}
+
+TEST_P(TimerfdTest, IllegalPwrite) {
+ auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), 0));
+ EXPECT_THAT(pwrite(tfd.get(), "x", 1, 0), SyscallFailsWithErrno(ESPIPE));
+ if (!IsRunningWithVFS1()) {
+ }
+}
+
+TEST_P(TimerfdTest, IllegalWrite) {
+ auto const tfd =
+ ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(GetParam(), TFD_NONBLOCK));
+ uint64_t val = 0;
+ EXPECT_THAT(write(tfd.get(), &val, sizeof(val)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+std::string PrintClockId(::testing::TestParamInfo<int> info) {
+ switch (info.param) {
+ case CLOCK_MONOTONIC:
+ return "CLOCK_MONOTONIC";
+ case CLOCK_BOOTTIME:
+ return "CLOCK_BOOTTIME";
+ default:
+ return absl::StrCat(info.param);
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(AllTimerTypes, TimerfdTest,
+ ::testing::Values(CLOCK_MONOTONIC, CLOCK_BOOTTIME),
+ PrintClockId);
+
+TEST(TimerfdClockRealtimeTest, ClockRealtime) {
+ // Since CLOCK_REALTIME can, by definition, change, we can't make any
+ // non-flaky assertions about the amount of time it takes for a
+ // CLOCK_REALTIME-based timer to expire. Just check that it expires at all,
+ // and hope it happens before the test times out.
+ constexpr int kDelaySecs = 1;
+
+ auto const tfd = ASSERT_NO_ERRNO_AND_VALUE(TimerfdCreate(CLOCK_REALTIME, 0));
+ struct itimerspec its = {};
+ its.it_value.tv_sec = kDelaySecs;
+ ASSERT_THAT(timerfd_settime(tfd.get(), /* flags = */ 0, &its, nullptr),
+ SyscallSucceeds());
+
+ uint64_t val = 0;
+ ASSERT_THAT(ReadFd(tfd.get(), &val, sizeof(uint64_t)),
+ SyscallSucceedsWithValue(sizeof(uint64_t)));
+ EXPECT_EQ(1, val);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/timers.cc b/test/syscalls/linux/timers.cc
new file mode 100644
index 000000000..4b3c44527
--- /dev/null
+++ b/test/syscalls/linux/timers.cc
@@ -0,0 +1,662 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <atomic>
+
+#include "gtest/gtest.h"
+#include "absl/flags/flag.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/cleanup.h"
+#include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+ABSL_FLAG(bool, timers_test_sleep, false,
+ "If true, sleep forever instead of running tests.");
+
+using ::testing::_;
+using ::testing::AnyOf;
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+#ifndef CPUCLOCK_PROF
+#define CPUCLOCK_PROF 0
+#endif // CPUCLOCK_PROF
+
+PosixErrorOr<absl::Duration> ProcessCPUTime(pid_t pid) {
+ // Use pid-specific CPUCLOCK_PROF, which is the clock used to enforce
+ // RLIMIT_CPU.
+ clockid_t clockid = (~static_cast<clockid_t>(pid) << 3) | CPUCLOCK_PROF;
+
+ struct timespec ts;
+ int ret = clock_gettime(clockid, &ts);
+ if (ret < 0) {
+ return PosixError(errno, "clock_gettime failed");
+ }
+
+ return absl::DurationFromTimespec(ts);
+}
+
+void NoopSignalHandler(int signo) {
+ TEST_CHECK_MSG(SIGXCPU == signo,
+ "NoopSigHandler did not receive expected signal");
+}
+
+void UninstallingSignalHandler(int signo) {
+ TEST_CHECK_MSG(SIGXCPU == signo,
+ "UninstallingSignalHandler did not receive expected signal");
+ struct sigaction rev_action;
+ rev_action.sa_handler = SIG_DFL;
+ rev_action.sa_flags = 0;
+ sigemptyset(&rev_action.sa_mask);
+ sigaction(SIGXCPU, &rev_action, nullptr);
+}
+
+TEST(TimerTest, ProcessKilledOnCPUSoftLimit) {
+ constexpr absl::Duration kSoftLimit = absl::Seconds(1);
+ constexpr absl::Duration kHardLimit = absl::Seconds(3);
+
+ struct rlimit cpu_limits;
+ cpu_limits.rlim_cur = absl::ToInt64Seconds(kSoftLimit);
+ cpu_limits.rlim_max = absl::ToInt64Seconds(kHardLimit);
+
+ int pid = fork();
+ MaybeSave();
+ if (pid == 0) {
+ TEST_PCHECK(setrlimit(RLIMIT_CPU, &cpu_limits) == 0);
+ MaybeSave();
+ for (;;) {
+ }
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ auto c = Cleanup([pid] {
+ int status;
+ EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFSIGNALED(status));
+ EXPECT_EQ(WTERMSIG(status), SIGXCPU);
+ });
+
+ // Wait for the child to exit, but do not reap it. This will allow us to check
+ // its CPU usage while it is zombied.
+ EXPECT_THAT(waitid(P_PID, pid, nullptr, WEXITED | WNOWAIT),
+ SyscallSucceeds());
+
+ // Assert that the child spent 1s of CPU before getting killed.
+ //
+ // We must be careful to use CPUCLOCK_PROF, the same clock used for RLIMIT_CPU
+ // enforcement, to get correct results. Note that this is slightly different
+ // from rusage-reported CPU usage:
+ //
+ // RLIMIT_CPU, CPUCLOCK_PROF use kernel/sched/cputime.c:thread_group_cputime.
+ // rusage uses kernel/sched/cputime.c:thread_group_cputime_adjusted.
+ absl::Duration cpu = ASSERT_NO_ERRNO_AND_VALUE(ProcessCPUTime(pid));
+ EXPECT_GE(cpu, kSoftLimit);
+
+ // Child did not make it to the hard limit.
+ //
+ // Linux sends SIGXCPU synchronously with CPU tick updates. See
+ // kernel/time/timer.c:update_process_times:
+ // => account_process_tick // update task CPU usage.
+ // => run_posix_cpu_timers // enforce RLIMIT_CPU, sending signal.
+ //
+ // Thus, only chance for this to flake is if the system time required to
+ // deliver the signal exceeds 2s.
+ EXPECT_LT(cpu, kHardLimit);
+}
+
+TEST(TimerTest, ProcessPingedRepeatedlyAfterCPUSoftLimit) {
+ struct sigaction new_action;
+ new_action.sa_handler = UninstallingSignalHandler;
+ new_action.sa_flags = 0;
+ sigemptyset(&new_action.sa_mask);
+
+ constexpr absl::Duration kSoftLimit = absl::Seconds(1);
+ constexpr absl::Duration kHardLimit = absl::Seconds(10);
+
+ struct rlimit cpu_limits;
+ cpu_limits.rlim_cur = absl::ToInt64Seconds(kSoftLimit);
+ cpu_limits.rlim_max = absl::ToInt64Seconds(kHardLimit);
+
+ int pid = fork();
+ MaybeSave();
+ if (pid == 0) {
+ TEST_PCHECK(sigaction(SIGXCPU, &new_action, nullptr) == 0);
+ MaybeSave();
+ TEST_PCHECK(setrlimit(RLIMIT_CPU, &cpu_limits) == 0);
+ MaybeSave();
+ for (;;) {
+ }
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ auto c = Cleanup([pid] {
+ int status;
+ EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFSIGNALED(status));
+ EXPECT_EQ(WTERMSIG(status), SIGXCPU);
+ });
+
+ // Wait for the child to exit, but do not reap it. This will allow us to check
+ // its CPU usage while it is zombied.
+ EXPECT_THAT(waitid(P_PID, pid, nullptr, WEXITED | WNOWAIT),
+ SyscallSucceeds());
+
+ absl::Duration cpu = ASSERT_NO_ERRNO_AND_VALUE(ProcessCPUTime(pid));
+ // Following signals come every CPU second.
+ EXPECT_GE(cpu, kSoftLimit + absl::Seconds(1));
+
+ // Child did not make it to the hard limit.
+ //
+ // As above, should not flake.
+ EXPECT_LT(cpu, kHardLimit);
+}
+
+TEST(TimerTest, ProcessKilledOnCPUHardLimit) {
+ struct sigaction new_action;
+ new_action.sa_handler = NoopSignalHandler;
+ new_action.sa_flags = 0;
+ sigemptyset(&new_action.sa_mask);
+
+ constexpr absl::Duration kSoftLimit = absl::Seconds(1);
+ constexpr absl::Duration kHardLimit = absl::Seconds(3);
+
+ struct rlimit cpu_limits;
+ cpu_limits.rlim_cur = absl::ToInt64Seconds(kSoftLimit);
+ cpu_limits.rlim_max = absl::ToInt64Seconds(kHardLimit);
+
+ int pid = fork();
+ MaybeSave();
+ if (pid == 0) {
+ TEST_PCHECK(sigaction(SIGXCPU, &new_action, nullptr) == 0);
+ MaybeSave();
+ TEST_PCHECK(setrlimit(RLIMIT_CPU, &cpu_limits) == 0);
+ MaybeSave();
+ for (;;) {
+ }
+ }
+ ASSERT_THAT(pid, SyscallSucceeds());
+ auto c = Cleanup([pid] {
+ int status;
+ EXPECT_THAT(waitpid(pid, &status, 0), SyscallSucceedsWithValue(pid));
+ EXPECT_TRUE(WIFSIGNALED(status));
+ EXPECT_EQ(WTERMSIG(status), SIGKILL);
+ });
+
+ // Wait for the child to exit, but do not reap it. This will allow us to check
+ // its CPU usage while it is zombied.
+ EXPECT_THAT(waitid(P_PID, pid, nullptr, WEXITED | WNOWAIT),
+ SyscallSucceeds());
+
+ absl::Duration cpu = ASSERT_NO_ERRNO_AND_VALUE(ProcessCPUTime(pid));
+ EXPECT_GE(cpu, kHardLimit);
+}
+
+// RAII type for a kernel "POSIX" interval timer. (The kernel provides system
+// calls such as timer_create that behave very similarly, but not identically,
+// to those described by timer_create(2); in particular, the kernel does not
+// implement SIGEV_THREAD. glibc builds POSIX-compliant interval timers based on
+// these kernel interval timers.)
+//
+// Compare implementation to FileDescriptor.
+class IntervalTimer {
+ public:
+ IntervalTimer() = default;
+
+ explicit IntervalTimer(int id) { set_id(id); }
+
+ IntervalTimer(IntervalTimer&& orig) : id_(orig.release()) {}
+
+ IntervalTimer& operator=(IntervalTimer&& orig) {
+ if (this == &orig) return *this;
+ reset(orig.release());
+ return *this;
+ }
+
+ IntervalTimer(const IntervalTimer& other) = delete;
+ IntervalTimer& operator=(const IntervalTimer& other) = delete;
+
+ ~IntervalTimer() { reset(); }
+
+ int get() const { return id_; }
+
+ int release() {
+ int const id = id_;
+ id_ = -1;
+ return id;
+ }
+
+ void reset() { reset(-1); }
+
+ void reset(int id) {
+ if (id_ >= 0) {
+ TEST_PCHECK(syscall(SYS_timer_delete, id_) == 0);
+ MaybeSave();
+ }
+ set_id(id);
+ }
+
+ PosixErrorOr<struct itimerspec> Set(
+ int flags, const struct itimerspec& new_value) const {
+ struct itimerspec old_value = {};
+ if (syscall(SYS_timer_settime, id_, flags, &new_value, &old_value) < 0) {
+ return PosixError(errno, "timer_settime");
+ }
+ MaybeSave();
+ return old_value;
+ }
+
+ PosixErrorOr<struct itimerspec> Get() const {
+ struct itimerspec curr_value = {};
+ if (syscall(SYS_timer_gettime, id_, &curr_value) < 0) {
+ return PosixError(errno, "timer_gettime");
+ }
+ MaybeSave();
+ return curr_value;
+ }
+
+ PosixErrorOr<int> Overruns() const {
+ int rv = syscall(SYS_timer_getoverrun, id_);
+ if (rv < 0) {
+ return PosixError(errno, "timer_getoverrun");
+ }
+ MaybeSave();
+ return rv;
+ }
+
+ private:
+ void set_id(int id) { id_ = std::max(id, -1); }
+
+ // Kernel timer_t is int; glibc timer_t is void*.
+ int id_ = -1;
+};
+
+PosixErrorOr<IntervalTimer> TimerCreate(clockid_t clockid,
+ const struct sigevent& sev) {
+ int timerid;
+ int ret = syscall(SYS_timer_create, clockid, &sev, &timerid);
+ if (ret < 0) {
+ return PosixError(errno, "timer_create");
+ }
+ if (ret > 0) {
+ return PosixError(EINVAL, "timer_create should never return positive");
+ }
+ MaybeSave();
+ return IntervalTimer(timerid);
+}
+
+// See timerfd.cc:TimerSlack() for rationale.
+constexpr absl::Duration kTimerSlack = absl::Milliseconds(500);
+
+TEST(IntervalTimerTest, IsInitiallyStopped) {
+ struct sigevent sev = {};
+ sev.sigev_notify = SIGEV_NONE;
+ const auto timer =
+ ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+ const struct itimerspec its = ASSERT_NO_ERRNO_AND_VALUE(timer.Get());
+ EXPECT_EQ(0, its.it_value.tv_sec);
+ EXPECT_EQ(0, its.it_value.tv_nsec);
+}
+
+// Kernel can create multiple timers without issue.
+//
+// Regression test for gvisor.dev/issue/1738.
+TEST(IntervalTimerTest, MultipleTimers) {
+ struct sigevent sev = {};
+ sev.sigev_notify = SIGEV_NONE;
+ const auto timer1 =
+ ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+ const auto timer2 =
+ ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+}
+
+TEST(IntervalTimerTest, SingleShotSilent) {
+ struct sigevent sev = {};
+ sev.sigev_notify = SIGEV_NONE;
+ const auto timer =
+ ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+
+ constexpr absl::Duration kDelay = absl::Seconds(1);
+ struct itimerspec its = {};
+ its.it_value = absl::ToTimespec(kDelay);
+ ASSERT_NO_ERRNO(timer.Set(0, its));
+
+ // The timer should count down to 0 and stop since the interval is zero. No
+ // overruns should be counted.
+ absl::SleepFor(kDelay + kTimerSlack);
+ its = ASSERT_NO_ERRNO_AND_VALUE(timer.Get());
+ EXPECT_EQ(0, its.it_value.tv_sec);
+ EXPECT_EQ(0, its.it_value.tv_nsec);
+ EXPECT_THAT(timer.Overruns(), IsPosixErrorOkAndHolds(0));
+}
+
+TEST(IntervalTimerTest, PeriodicSilent) {
+ struct sigevent sev = {};
+ sev.sigev_notify = SIGEV_NONE;
+ const auto timer =
+ ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+
+ constexpr absl::Duration kPeriod = absl::Seconds(1);
+ struct itimerspec its = {};
+ its.it_value = its.it_interval = absl::ToTimespec(kPeriod);
+ ASSERT_NO_ERRNO(timer.Set(0, its));
+
+ absl::SleepFor(kPeriod * 3 + kTimerSlack);
+
+ // The timer should still be running.
+ its = ASSERT_NO_ERRNO_AND_VALUE(timer.Get());
+ EXPECT_TRUE(its.it_value.tv_nsec != 0 || its.it_value.tv_sec != 0);
+
+ // Timer expirations are not counted as overruns under SIGEV_NONE.
+ EXPECT_THAT(timer.Overruns(), IsPosixErrorOkAndHolds(0));
+}
+
+std::atomic<int> counted_signals;
+
+void IntervalTimerCountingSignalHandler(int sig, siginfo_t* info,
+ void* ucontext) {
+ counted_signals.fetch_add(1 + info->si_overrun);
+}
+
+TEST(IntervalTimerTest, PeriodicGroupDirectedSignal) {
+ constexpr int kSigno = SIGUSR1;
+ constexpr int kSigvalue = 42;
+
+ // Install our signal handler.
+ counted_signals.store(0);
+ struct sigaction sa = {};
+ sa.sa_sigaction = IntervalTimerCountingSignalHandler;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ const auto scoped_sigaction =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(kSigno, sa));
+
+ // Ensure that kSigno is unblocked on at least one thread.
+ const auto scoped_sigmask =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, kSigno));
+
+ struct sigevent sev = {};
+ sev.sigev_notify = SIGEV_SIGNAL;
+ sev.sigev_signo = kSigno;
+ sev.sigev_value.sival_int = kSigvalue;
+ auto timer = ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+
+ constexpr absl::Duration kPeriod = absl::Seconds(1);
+ constexpr int kCycles = 3;
+ struct itimerspec its = {};
+ its.it_value = its.it_interval = absl::ToTimespec(kPeriod);
+ ASSERT_NO_ERRNO(timer.Set(0, its));
+
+ absl::SleepFor(kPeriod * kCycles + kTimerSlack);
+ EXPECT_GE(counted_signals.load(), kCycles);
+}
+
+// From Linux's include/uapi/asm-generic/siginfo.h.
+#ifndef sigev_notify_thread_id
+#define sigev_notify_thread_id _sigev_un._tid
+#endif
+
+TEST(IntervalTimerTest, PeriodicThreadDirectedSignal) {
+ constexpr int kSigno = SIGUSR1;
+ constexpr int kSigvalue = 42;
+
+ // Block kSigno so that we can accumulate overruns.
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, kSigno);
+ const auto scoped_sigmask =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, mask));
+
+ struct sigevent sev = {};
+ sev.sigev_notify = SIGEV_THREAD_ID;
+ sev.sigev_signo = kSigno;
+ sev.sigev_value.sival_int = kSigvalue;
+ sev.sigev_notify_thread_id = gettid();
+ auto timer = ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+
+ constexpr absl::Duration kPeriod = absl::Seconds(1);
+ constexpr int kCycles = 3;
+ struct itimerspec its = {};
+ its.it_value = its.it_interval = absl::ToTimespec(kPeriod);
+ ASSERT_NO_ERRNO(timer.Set(0, its));
+ absl::SleepFor(kPeriod * kCycles + kTimerSlack);
+
+ // At least kCycles expirations should have occurred, resulting in kCycles-1
+ // overruns (the first expiration sent the signal successfully).
+ siginfo_t si;
+ struct timespec zero_ts = absl::ToTimespec(absl::ZeroDuration());
+ ASSERT_THAT(sigtimedwait(&mask, &si, &zero_ts),
+ SyscallSucceedsWithValue(kSigno));
+ EXPECT_EQ(si.si_signo, kSigno);
+ EXPECT_EQ(si.si_code, SI_TIMER);
+ EXPECT_EQ(si.si_timerid, timer.get());
+ EXPECT_GE(si.si_overrun, kCycles - 1);
+ EXPECT_EQ(si.si_int, kSigvalue);
+
+ // Kill the timer, then drain any additional signal it may have enqueued. We
+ // can't do this before the preceding sigtimedwait because stopping or
+ // deleting the timer resets si_overrun to 0.
+ timer.reset();
+ sigtimedwait(&mask, &si, &zero_ts);
+}
+
+TEST(IntervalTimerTest, OtherThreadGroup) {
+ constexpr int kSigno = SIGUSR1;
+
+ // Create a subprocess that does nothing until killed.
+ pid_t child_pid;
+ const auto sp = ASSERT_NO_ERRNO_AND_VALUE(ForkAndExec(
+ "/proc/self/exe", ExecveArray({"timers", "--timers_test_sleep"}),
+ ExecveArray(), &child_pid, nullptr));
+
+ // Verify that we can't create a timer that would send signals to it.
+ struct sigevent sev = {};
+ sev.sigev_notify = SIGEV_THREAD_ID;
+ sev.sigev_signo = kSigno;
+ sev.sigev_notify_thread_id = child_pid;
+ EXPECT_THAT(TimerCreate(CLOCK_MONOTONIC, sev), PosixErrorIs(EINVAL, _));
+}
+
+TEST(IntervalTimerTest, RealTimeSignalsAreNotDuplicated) {
+ const int kSigno = SIGRTMIN;
+ constexpr int kSigvalue = 42;
+
+ // Block signo so that we can accumulate overruns.
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, kSigno);
+ const auto scoped_sigmask = ScopedSignalMask(SIG_BLOCK, mask);
+
+ struct sigevent sev = {};
+ sev.sigev_notify = SIGEV_THREAD_ID;
+ sev.sigev_signo = kSigno;
+ sev.sigev_value.sival_int = kSigvalue;
+ sev.sigev_notify_thread_id = gettid();
+ const auto timer =
+ ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+
+ constexpr absl::Duration kPeriod = absl::Seconds(1);
+ constexpr int kCycles = 3;
+ struct itimerspec its = {};
+ its.it_value = its.it_interval = absl::ToTimespec(kPeriod);
+ ASSERT_NO_ERRNO(timer.Set(0, its));
+ absl::SleepFor(kPeriod * kCycles + kTimerSlack);
+
+ // Stop the timer so that no further signals are enqueued after sigtimedwait.
+ struct timespec zero_ts = absl::ToTimespec(absl::ZeroDuration());
+ its.it_value = its.it_interval = zero_ts;
+ ASSERT_NO_ERRNO(timer.Set(0, its));
+
+ // The timer should have sent only a single signal, even though the kernel
+ // supports enqueueing of multiple RT signals.
+ siginfo_t si;
+ ASSERT_THAT(sigtimedwait(&mask, &si, &zero_ts),
+ SyscallSucceedsWithValue(kSigno));
+ EXPECT_EQ(si.si_signo, kSigno);
+ EXPECT_EQ(si.si_code, SI_TIMER);
+ EXPECT_EQ(si.si_timerid, timer.get());
+ // si_overrun was reset by timer_settime.
+ EXPECT_EQ(si.si_overrun, 0);
+ EXPECT_EQ(si.si_int, kSigvalue);
+ EXPECT_THAT(sigtimedwait(&mask, &si, &zero_ts),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST(IntervalTimerTest, AlreadyPendingSignal) {
+ constexpr int kSigno = SIGUSR1;
+ constexpr int kSigvalue = 42;
+
+ // Block kSigno so that we can accumulate overruns.
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, kSigno);
+ const auto scoped_sigmask =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, mask));
+
+ // Send ourselves a signal, preventing the timer from enqueuing.
+ ASSERT_THAT(tgkill(getpid(), gettid(), kSigno), SyscallSucceeds());
+
+ struct sigevent sev = {};
+ sev.sigev_notify = SIGEV_THREAD_ID;
+ sev.sigev_signo = kSigno;
+ sev.sigev_value.sival_int = kSigvalue;
+ sev.sigev_notify_thread_id = gettid();
+ auto timer = ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+
+ constexpr absl::Duration kPeriod = absl::Seconds(1);
+ constexpr int kCycles = 3;
+ struct itimerspec its = {};
+ its.it_value = its.it_interval = absl::ToTimespec(kPeriod);
+ ASSERT_NO_ERRNO(timer.Set(0, its));
+
+ // End the sleep one cycle short; we will sleep for one more cycle below.
+ absl::SleepFor(kPeriod * (kCycles - 1));
+
+ // Dequeue the first signal, which we sent to ourselves with tgkill.
+ siginfo_t si;
+ struct timespec zero_ts = absl::ToTimespec(absl::ZeroDuration());
+ ASSERT_THAT(sigtimedwait(&mask, &si, &zero_ts),
+ SyscallSucceedsWithValue(kSigno));
+ EXPECT_EQ(si.si_signo, kSigno);
+ // glibc sigtimedwait silently replaces SI_TKILL with SI_USER:
+ // sysdeps/unix/sysv/linux/sigtimedwait.c:__sigtimedwait(). This isn't
+ // documented, so we don't depend on it.
+ EXPECT_THAT(si.si_code, AnyOf(SI_USER, SI_TKILL));
+
+ // Sleep for 1 more cycle to give the timer time to send a signal.
+ absl::SleepFor(kPeriod + kTimerSlack);
+
+ // At least kCycles expirations should have occurred, resulting in kCycles-1
+ // overruns (the last expiration sent the signal successfully).
+ ASSERT_THAT(sigtimedwait(&mask, &si, &zero_ts),
+ SyscallSucceedsWithValue(kSigno));
+ EXPECT_EQ(si.si_signo, kSigno);
+ EXPECT_EQ(si.si_code, SI_TIMER);
+ EXPECT_EQ(si.si_timerid, timer.get());
+ EXPECT_GE(si.si_overrun, kCycles - 1);
+ EXPECT_EQ(si.si_int, kSigvalue);
+
+ // Kill the timer, then drain any additional signal it may have enqueued. We
+ // can't do this before the preceding sigtimedwait because stopping or
+ // deleting the timer resets si_overrun to 0.
+ timer.reset();
+ sigtimedwait(&mask, &si, &zero_ts);
+}
+
+TEST(IntervalTimerTest, IgnoredSignalCountsAsOverrun) {
+ constexpr int kSigno = SIGUSR1;
+ constexpr int kSigvalue = 42;
+
+ // Ignore kSigno.
+ struct sigaction sa = {};
+ sa.sa_handler = SIG_IGN;
+ const auto scoped_sigaction =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(kSigno, sa));
+
+ // Unblock kSigno so that ignored signals will be discarded.
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, kSigno);
+ auto scoped_sigmask =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_UNBLOCK, mask));
+
+ struct sigevent sev = {};
+ sev.sigev_notify = SIGEV_THREAD_ID;
+ sev.sigev_signo = kSigno;
+ sev.sigev_value.sival_int = kSigvalue;
+ sev.sigev_notify_thread_id = gettid();
+ auto timer = ASSERT_NO_ERRNO_AND_VALUE(TimerCreate(CLOCK_MONOTONIC, sev));
+
+ constexpr absl::Duration kPeriod = absl::Seconds(1);
+ constexpr int kCycles = 3;
+ struct itimerspec its = {};
+ its.it_value = its.it_interval = absl::ToTimespec(kPeriod);
+ ASSERT_NO_ERRNO(timer.Set(0, its));
+
+ // End the sleep one cycle short; we will sleep for one more cycle below.
+ absl::SleepFor(kPeriod * (kCycles - 1));
+
+ // Block kSigno so that ignored signals will be enqueued.
+ scoped_sigmask.Release()();
+ scoped_sigmask = ASSERT_NO_ERRNO_AND_VALUE(ScopedSignalMask(SIG_BLOCK, mask));
+
+ // Sleep for 1 more cycle to give the timer time to send a signal.
+ absl::SleepFor(kPeriod + kTimerSlack);
+
+ // At least kCycles expirations should have occurred, resulting in kCycles-1
+ // overruns (the last expiration sent the signal successfully).
+ siginfo_t si;
+ struct timespec zero_ts = absl::ToTimespec(absl::ZeroDuration());
+ ASSERT_THAT(sigtimedwait(&mask, &si, &zero_ts),
+ SyscallSucceedsWithValue(kSigno));
+ EXPECT_EQ(si.si_signo, kSigno);
+ EXPECT_EQ(si.si_code, SI_TIMER);
+ EXPECT_EQ(si.si_timerid, timer.get());
+ EXPECT_GE(si.si_overrun, kCycles - 1);
+ EXPECT_EQ(si.si_int, kSigvalue);
+
+ // Kill the timer, then drain any additional signal it may have enqueued. We
+ // can't do this before the preceding sigtimedwait because stopping or
+ // deleting the timer resets si_overrun to 0.
+ timer.reset();
+ sigtimedwait(&mask, &si, &zero_ts);
+}
+
+} // namespace
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ gvisor::testing::TestInit(&argc, &argv);
+
+ if (absl::GetFlag(FLAGS_timers_test_sleep)) {
+ while (true) {
+ absl::SleepFor(absl::Seconds(10));
+ }
+ }
+
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/tkill.cc b/test/syscalls/linux/tkill.cc
new file mode 100644
index 000000000..8d8ebbb24
--- /dev/null
+++ b/test/syscalls/linux/tkill.cc
@@ -0,0 +1,75 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <cerrno>
+#include <csignal>
+
+#include "gtest/gtest.h"
+#include "test/util/logging.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+static int tkill(pid_t tid, int sig) {
+ int ret;
+ do {
+ // NOTE(b/25434735): tkill(2) could return EAGAIN for RT signals.
+ ret = syscall(SYS_tkill, tid, sig);
+ } while (ret == -1 && errno == EAGAIN);
+ return ret;
+}
+
+TEST(TkillTest, InvalidTID) {
+ EXPECT_THAT(tkill(-1, 0), SyscallFailsWithErrno(EINVAL));
+ EXPECT_THAT(tkill(0, 0), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(TkillTest, ValidTID) {
+ EXPECT_THAT(tkill(gettid(), 0), SyscallSucceeds());
+}
+
+void SigHandler(int sig, siginfo_t* info, void* context) {
+ TEST_CHECK(sig == SIGRTMAX);
+ TEST_CHECK(info->si_pid == getpid());
+ TEST_CHECK(info->si_uid == getuid());
+ TEST_CHECK(info->si_code == SI_TKILL);
+}
+
+// Test with a real signal. Regression test for b/24790092.
+TEST(TkillTest, ValidTIDAndRealSignal) {
+ struct sigaction sa;
+ sa.sa_sigaction = SigHandler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ ASSERT_THAT(sigaction(SIGRTMAX, &sa, nullptr), SyscallSucceeds());
+ // InitGoogle blocks all RT signals, so we need undo it.
+ sigset_t unblock;
+ sigemptyset(&unblock);
+ sigaddset(&unblock, SIGRTMAX);
+ ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &unblock, nullptr), SyscallSucceeds());
+ EXPECT_THAT(tkill(gettid(), SIGRTMAX), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/truncate.cc b/test/syscalls/linux/truncate.cc
new file mode 100644
index 000000000..c988c6380
--- /dev/null
+++ b/test/syscalls/linux/truncate.cc
@@ -0,0 +1,218 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <iostream>
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class FixtureTruncateTest : public FileTest {
+ void SetUp() override { FileTest::SetUp(); }
+};
+
+TEST_F(FixtureTruncateTest, Truncate) {
+ // Get the current rlimit and restore after test run.
+ struct rlimit initial_lim;
+ ASSERT_THAT(getrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds());
+ auto cleanup = Cleanup([&initial_lim] {
+ EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds());
+ });
+
+ // Check that it starts at size zero.
+ struct stat buf;
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 0);
+
+ // Stay at size zero.
+ EXPECT_THAT(truncate(test_file_name_.c_str(), 0), SyscallSucceeds());
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 0);
+
+ // Grow to ten bytes.
+ EXPECT_THAT(truncate(test_file_name_.c_str(), 10), SyscallSucceeds());
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 10);
+
+ // Can't be truncated to a negative number.
+ EXPECT_THAT(truncate(test_file_name_.c_str(), -1),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Try growing past the file size limit.
+ sigset_t new_mask;
+ sigemptyset(&new_mask);
+ sigaddset(&new_mask, SIGXFSZ);
+ sigprocmask(SIG_BLOCK, &new_mask, nullptr);
+ struct timespec timelimit;
+ timelimit.tv_sec = 10;
+ timelimit.tv_nsec = 0;
+
+ struct rlimit setlim;
+ setlim.rlim_cur = 1024;
+ setlim.rlim_max = RLIM_INFINITY;
+ ASSERT_THAT(setrlimit(RLIMIT_FSIZE, &setlim), SyscallSucceeds());
+ EXPECT_THAT(truncate(test_file_name_.c_str(), 1025),
+ SyscallFailsWithErrno(EFBIG));
+ EXPECT_EQ(sigtimedwait(&new_mask, nullptr, &timelimit), SIGXFSZ);
+ ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &new_mask, nullptr), SyscallSucceeds());
+
+ // Shrink back down to zero.
+ EXPECT_THAT(truncate(test_file_name_.c_str(), 0), SyscallSucceeds());
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 0);
+}
+
+TEST_F(FixtureTruncateTest, Ftruncate) {
+ // Get the current rlimit and restore after test run.
+ struct rlimit initial_lim;
+ ASSERT_THAT(getrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds());
+ auto cleanup = Cleanup([&initial_lim] {
+ EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds());
+ });
+
+ // Check that it starts at size zero.
+ struct stat buf;
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 0);
+
+ // Stay at size zero.
+ EXPECT_THAT(ftruncate(test_file_fd_.get(), 0), SyscallSucceeds());
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 0);
+
+ // Grow to ten bytes.
+ EXPECT_THAT(ftruncate(test_file_fd_.get(), 10), SyscallSucceeds());
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 10);
+
+ // Can't be truncated to a negative number.
+ EXPECT_THAT(ftruncate(test_file_fd_.get(), -1),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Try growing past the file size limit.
+ sigset_t new_mask;
+ sigemptyset(&new_mask);
+ sigaddset(&new_mask, SIGXFSZ);
+ sigprocmask(SIG_BLOCK, &new_mask, nullptr);
+ struct timespec timelimit;
+ timelimit.tv_sec = 10;
+ timelimit.tv_nsec = 0;
+
+ struct rlimit setlim;
+ setlim.rlim_cur = 1024;
+ setlim.rlim_max = RLIM_INFINITY;
+ ASSERT_THAT(setrlimit(RLIMIT_FSIZE, &setlim), SyscallSucceeds());
+ EXPECT_THAT(ftruncate(test_file_fd_.get(), 1025),
+ SyscallFailsWithErrno(EFBIG));
+ EXPECT_EQ(sigtimedwait(&new_mask, nullptr, &timelimit), SIGXFSZ);
+ ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &new_mask, nullptr), SyscallSucceeds());
+
+ // Shrink back down to zero.
+ EXPECT_THAT(ftruncate(test_file_fd_.get(), 0), SyscallSucceeds());
+ ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds());
+ EXPECT_EQ(buf.st_size, 0);
+}
+
+// Truncating a file down clears that portion of the file.
+TEST_F(FixtureTruncateTest, FtruncateShrinkGrow) {
+ std::vector<char> buf(10, 'a');
+ EXPECT_THAT(WriteFd(test_file_fd_.get(), buf.data(), buf.size()),
+ SyscallSucceedsWithValue(buf.size()));
+
+ // Shrink then regrow the file. This should clear the second half of the file.
+ EXPECT_THAT(ftruncate(test_file_fd_.get(), 5), SyscallSucceeds());
+ EXPECT_THAT(ftruncate(test_file_fd_.get(), 10), SyscallSucceeds());
+
+ EXPECT_THAT(lseek(test_file_fd_.get(), 0, SEEK_SET), SyscallSucceeds());
+
+ std::vector<char> buf2(10);
+ EXPECT_THAT(ReadFd(test_file_fd_.get(), buf2.data(), buf2.size()),
+ SyscallSucceedsWithValue(buf2.size()));
+
+ std::vector<char> expect = {'a', 'a', 'a', 'a', 'a',
+ '\0', '\0', '\0', '\0', '\0'};
+ EXPECT_EQ(expect, buf2);
+}
+
+TEST(TruncateTest, TruncateDir) {
+ auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(truncate(temp_dir.path().c_str(), 0),
+ SyscallFailsWithErrno(EISDIR));
+}
+
+TEST(TruncateTest, FtruncateDir) {
+ auto temp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(temp_dir.path(), O_DIRECTORY | O_RDONLY));
+ EXPECT_THAT(ftruncate(fd.get(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(TruncateTest, TruncateNonWriteable) {
+ // Make sure we don't have CAP_DAC_OVERRIDE, since that allows the user to
+ // always override write permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::string_view(), 0555 /* mode */));
+ EXPECT_THAT(truncate(temp_file.path().c_str(), 0),
+ SyscallFailsWithErrno(EACCES));
+}
+
+TEST(TruncateTest, FtruncateNonWriteable) {
+ auto temp_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::string_view(), 0555 /* mode */));
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file.path(), O_RDONLY));
+ EXPECT_THAT(ftruncate(fd.get(), 0), SyscallFailsWithErrno(EINVAL));
+}
+
+TEST(TruncateTest, TruncateNonExist) {
+ EXPECT_THAT(truncate("/foo/bar", 0), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST(TruncateTest, FtruncateVirtualTmp_NoRandomSave) {
+ auto temp_file = NewTempAbsPathInDir("/dev/shm");
+ const DisableSave ds; // Incompatible permissions.
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(temp_file, O_RDWR | O_CREAT | O_EXCL, 0));
+ EXPECT_THAT(ftruncate(fd.get(), 100), SyscallSucceeds());
+}
+
+// NOTE: There are additional truncate(2)/ftruncate(2) tests in mknod.cc
+// which are there to avoid running the tests on a number of different
+// filesystems which may not support mknod.
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc
new file mode 100644
index 000000000..97d554e72
--- /dev/null
+++ b/test/syscalls/linux/tuntap.cc
@@ -0,0 +1,422 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <linux/capability.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_tun.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_split.h"
+#include "test/syscalls/linux/socket_netlink_route_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+constexpr int kIPLen = 4;
+
+constexpr const char kDevNetTun[] = "/dev/net/tun";
+constexpr const char kTapName[] = "tap0";
+
+constexpr const uint8_t kMacA[ETH_ALEN] = {0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA};
+constexpr const uint8_t kMacB[ETH_ALEN] = {0xBB, 0xBB, 0xBB, 0xBB, 0xBB, 0xBB};
+
+PosixErrorOr<std::set<std::string>> DumpLinkNames() {
+ ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+ std::set<std::string> names;
+ for (const auto& link : links) {
+ names.emplace(link.name);
+ }
+ return names;
+}
+
+PosixErrorOr<Link> GetLinkByName(const std::string& name) {
+ ASSIGN_OR_RETURN_ERRNO(auto links, DumpLinks());
+ for (const auto& link : links) {
+ if (link.name == name) {
+ return link;
+ }
+ }
+ return PosixError(ENOENT, "interface not found");
+}
+
+struct pihdr {
+ uint16_t pi_flags;
+ uint16_t pi_protocol;
+} __attribute__((packed));
+
+struct ping_pkt {
+ pihdr pi;
+ struct ethhdr eth;
+ struct iphdr ip;
+ struct icmphdr icmp;
+ char payload[64];
+} __attribute__((packed));
+
+ping_pkt CreatePingPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
+ const uint8_t dstmac[ETH_ALEN], const char* dstip) {
+ ping_pkt pkt = {};
+
+ pkt.pi.pi_protocol = htons(ETH_P_IP);
+
+ memcpy(pkt.eth.h_dest, dstmac, sizeof(pkt.eth.h_dest));
+ memcpy(pkt.eth.h_source, srcmac, sizeof(pkt.eth.h_source));
+ pkt.eth.h_proto = htons(ETH_P_IP);
+
+ pkt.ip.ihl = 5;
+ pkt.ip.version = 4;
+ pkt.ip.tos = 0;
+ pkt.ip.tot_len = htons(sizeof(struct iphdr) + sizeof(struct icmphdr) +
+ sizeof(pkt.payload));
+ pkt.ip.id = 1;
+ pkt.ip.frag_off = 1 << 6; // Do not fragment
+ pkt.ip.ttl = 64;
+ pkt.ip.protocol = IPPROTO_ICMP;
+ inet_pton(AF_INET, dstip, &pkt.ip.daddr);
+ inet_pton(AF_INET, srcip, &pkt.ip.saddr);
+ pkt.ip.check = IPChecksum(pkt.ip);
+
+ pkt.icmp.type = ICMP_ECHO;
+ pkt.icmp.code = 0;
+ pkt.icmp.checksum = 0;
+ pkt.icmp.un.echo.sequence = 1;
+ pkt.icmp.un.echo.id = 1;
+
+ strncpy(pkt.payload, "abcd", sizeof(pkt.payload));
+ pkt.icmp.checksum = ICMPChecksum(pkt.icmp, pkt.payload, sizeof(pkt.payload));
+
+ return pkt;
+}
+
+struct arp_pkt {
+ pihdr pi;
+ struct ethhdr eth;
+ struct arphdr arp;
+ uint8_t arp_sha[ETH_ALEN];
+ uint8_t arp_spa[kIPLen];
+ uint8_t arp_tha[ETH_ALEN];
+ uint8_t arp_tpa[kIPLen];
+} __attribute__((packed));
+
+std::string CreateArpPacket(const uint8_t srcmac[ETH_ALEN], const char* srcip,
+ const uint8_t dstmac[ETH_ALEN], const char* dstip) {
+ std::string buffer;
+ buffer.resize(sizeof(arp_pkt));
+
+ arp_pkt* pkt = reinterpret_cast<arp_pkt*>(&buffer[0]);
+ {
+ pkt->pi.pi_protocol = htons(ETH_P_ARP);
+
+ memcpy(pkt->eth.h_dest, kMacA, sizeof(pkt->eth.h_dest));
+ memcpy(pkt->eth.h_source, kMacB, sizeof(pkt->eth.h_source));
+ pkt->eth.h_proto = htons(ETH_P_ARP);
+
+ pkt->arp.ar_hrd = htons(ARPHRD_ETHER);
+ pkt->arp.ar_pro = htons(ETH_P_IP);
+ pkt->arp.ar_hln = ETH_ALEN;
+ pkt->arp.ar_pln = kIPLen;
+ pkt->arp.ar_op = htons(ARPOP_REPLY);
+
+ memcpy(pkt->arp_sha, srcmac, sizeof(pkt->arp_sha));
+ inet_pton(AF_INET, srcip, pkt->arp_spa);
+ memcpy(pkt->arp_tha, dstmac, sizeof(pkt->arp_tha));
+ inet_pton(AF_INET, dstip, pkt->arp_tpa);
+ }
+ return buffer;
+}
+
+} // namespace
+
+TEST(TuntapStaticTest, NetTunExists) {
+ struct stat statbuf;
+ ASSERT_THAT(stat(kDevNetTun, &statbuf), SyscallSucceeds());
+ // Check that it's a character device with rw-rw-rw- permissions.
+ EXPECT_EQ(statbuf.st_mode, S_IFCHR | 0666);
+}
+
+class TuntapTest : public ::testing::Test {
+ protected:
+ void TearDown() override {
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))) {
+ // Bring back capability if we had dropped it in test case.
+ ASSERT_NO_ERRNO(SetCapability(CAP_NET_ADMIN, true));
+ }
+ }
+};
+
+TEST_F(TuntapTest, CreateInterfaceNoCap) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+ ASSERT_NO_ERRNO(SetCapability(CAP_NET_ADMIN, false));
+
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+ struct ifreq ifr = {};
+ ifr.ifr_flags = IFF_TAP;
+ strncpy(ifr.ifr_name, kTapName, IFNAMSIZ);
+
+ EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallFailsWithErrno(EPERM));
+}
+
+TEST_F(TuntapTest, CreateFixedNameInterface) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+ struct ifreq ifr_set = {};
+ ifr_set.ifr_flags = IFF_TAP;
+ strncpy(ifr_set.ifr_name, kTapName, IFNAMSIZ);
+ EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr_set),
+ SyscallSucceedsWithValue(0));
+
+ struct ifreq ifr_get = {};
+ EXPECT_THAT(ioctl(fd.get(), TUNGETIFF, &ifr_get),
+ SyscallSucceedsWithValue(0));
+
+ struct ifreq ifr_expect = ifr_set;
+ // See __tun_chr_ioctl() in net/drivers/tun.c.
+ ifr_expect.ifr_flags |= IFF_NOFILTER;
+
+ EXPECT_THAT(DumpLinkNames(),
+ IsPosixErrorOkAndHolds(::testing::Contains(kTapName)));
+ EXPECT_THAT(memcmp(&ifr_expect, &ifr_get, sizeof(ifr_get)), ::testing::Eq(0));
+}
+
+TEST_F(TuntapTest, CreateInterface) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+ struct ifreq ifr = {};
+ ifr.ifr_flags = IFF_TAP;
+ // Empty ifr.ifr_name. Let kernel assign.
+
+ EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallSucceedsWithValue(0));
+
+ struct ifreq ifr_get = {};
+ EXPECT_THAT(ioctl(fd.get(), TUNGETIFF, &ifr_get),
+ SyscallSucceedsWithValue(0));
+
+ std::string ifname = ifr_get.ifr_name;
+ EXPECT_THAT(ifname, ::testing::StartsWith("tap"));
+ EXPECT_THAT(DumpLinkNames(),
+ IsPosixErrorOkAndHolds(::testing::Contains(ifname)));
+}
+
+TEST_F(TuntapTest, InvalidReadWrite) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+ char buf[128] = {};
+ EXPECT_THAT(read(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EBADFD));
+ EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EBADFD));
+}
+
+TEST_F(TuntapTest, WriteToDownDevice) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+ // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces.
+ SKIP_IF(IsRunningOnGvisor());
+
+ FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR));
+
+ // Device created should be down by default.
+ struct ifreq ifr = {};
+ ifr.ifr_flags = IFF_TAP;
+ EXPECT_THAT(ioctl(fd.get(), TUNSETIFF, &ifr), SyscallSucceedsWithValue(0));
+
+ char buf[128] = {};
+ EXPECT_THAT(write(fd.get(), buf, sizeof(buf)), SyscallFailsWithErrno(EIO));
+}
+
+PosixErrorOr<FileDescriptor> OpenAndAttachTap(
+ const std::string& dev_name, const std::string& dev_ipv4_addr) {
+ // Interface creation.
+ ASSIGN_OR_RETURN_ERRNO(FileDescriptor fd, Open(kDevNetTun, O_RDWR));
+
+ struct ifreq ifr_set = {};
+ ifr_set.ifr_flags = IFF_TAP;
+ strncpy(ifr_set.ifr_name, dev_name.c_str(), IFNAMSIZ);
+ if (ioctl(fd.get(), TUNSETIFF, &ifr_set) < 0) {
+ return PosixError(errno);
+ }
+
+ ASSIGN_OR_RETURN_ERRNO(auto link, GetLinkByName(dev_name));
+
+ // Interface setup.
+ struct in_addr addr;
+ inet_pton(AF_INET, dev_ipv4_addr.c_str(), &addr);
+ EXPECT_NO_ERRNO(LinkAddLocalAddr(link.index, AF_INET, /*prefixlen=*/24, &addr,
+ sizeof(addr)));
+
+ if (!IsRunningOnGvisor()) {
+ // FIXME(b/110961832): gVisor doesn't support setting MAC address on
+ // interfaces yet.
+ RETURN_IF_ERRNO(LinkSetMacAddr(link.index, kMacA, sizeof(kMacA)));
+
+ // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces.
+ RETURN_IF_ERRNO(LinkChangeFlags(link.index, IFF_UP, IFF_UP));
+ }
+
+ return fd;
+}
+
+// This test sets up a TAP device and pings kernel by sending ICMP echo request.
+//
+// It works as the following:
+// * Open /dev/net/tun, and create kTapName interface.
+// * Use rtnetlink to do initial setup of the interface:
+// * Assign IP address 10.0.0.1/24 to kernel.
+// * MAC address: kMacA
+// * Bring up the interface.
+// * Send an ICMP echo reqest (ping) packet from 10.0.0.2 (kMacB) to kernel.
+// * Loop to receive packets from TAP device/fd:
+// * If packet is an ICMP echo reply, it stops and passes the test.
+// * If packet is an ARP request, it responds with canned reply and resends
+// the
+// ICMP request packet.
+TEST_F(TuntapTest, PingKernel) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, "10.0.0.1"));
+ ping_pkt ping_req = CreatePingPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
+ std::string arp_rep = CreateArpPacket(kMacB, "10.0.0.2", kMacA, "10.0.0.1");
+
+ // Send ping, this would trigger an ARP request on Linux.
+ EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+ SyscallSucceedsWithValue(sizeof(ping_req)));
+
+ // Receive loop to process inbound packets.
+ struct inpkt {
+ union {
+ pihdr pi;
+ ping_pkt ping;
+ arp_pkt arp;
+ };
+ };
+ while (1) {
+ inpkt r = {};
+ int n = read(fd.get(), &r, sizeof(r));
+ EXPECT_THAT(n, SyscallSucceeds());
+
+ if (n < sizeof(pihdr)) {
+ std::cerr << "Ignored packet, protocol: " << r.pi.pi_protocol
+ << " len: " << n << std::endl;
+ continue;
+ }
+
+ // Process ARP packet.
+ if (n >= sizeof(arp_pkt) && r.pi.pi_protocol == htons(ETH_P_ARP)) {
+ // Respond with canned ARP reply.
+ EXPECT_THAT(write(fd.get(), arp_rep.data(), arp_rep.size()),
+ SyscallSucceedsWithValue(arp_rep.size()));
+ // First ping request might have been dropped due to mac address not in
+ // ARP cache. Send it again.
+ EXPECT_THAT(write(fd.get(), &ping_req, sizeof(ping_req)),
+ SyscallSucceedsWithValue(sizeof(ping_req)));
+ }
+
+ // Process ping response packet.
+ if (n >= sizeof(ping_pkt) && r.pi.pi_protocol == ping_req.pi.pi_protocol &&
+ r.ping.ip.protocol == ping_req.ip.protocol &&
+ !memcmp(&r.ping.ip.saddr, &ping_req.ip.daddr, kIPLen) &&
+ !memcmp(&r.ping.ip.daddr, &ping_req.ip.saddr, kIPLen) &&
+ r.ping.icmp.type == 0 && r.ping.icmp.code == 0) {
+ // Ends and passes the test.
+ break;
+ }
+ }
+}
+
+TEST_F(TuntapTest, SendUdpTriggersArpResolution) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, "10.0.0.1"));
+
+ // Send a UDP packet to remote.
+ int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
+ ASSERT_THAT(sock, SyscallSucceeds());
+
+ struct sockaddr_in remote = {};
+ remote.sin_family = AF_INET;
+ remote.sin_port = htons(42);
+ inet_pton(AF_INET, "10.0.0.2", &remote.sin_addr);
+ int ret = sendto(sock, "hello", 5, 0, reinterpret_cast<sockaddr*>(&remote),
+ sizeof(remote));
+ ASSERT_THAT(ret, ::testing::AnyOf(SyscallSucceeds(),
+ SyscallFailsWithErrno(EHOSTDOWN)));
+
+ struct inpkt {
+ union {
+ pihdr pi;
+ arp_pkt arp;
+ };
+ };
+ while (1) {
+ inpkt r = {};
+ int n = read(fd.get(), &r, sizeof(r));
+ EXPECT_THAT(n, SyscallSucceeds());
+
+ if (n < sizeof(pihdr)) {
+ std::cerr << "Ignored packet, protocol: " << r.pi.pi_protocol
+ << " len: " << n << std::endl;
+ continue;
+ }
+
+ if (n >= sizeof(arp_pkt) && r.pi.pi_protocol == htons(ETH_P_ARP)) {
+ break;
+ }
+ }
+}
+
+// Write hang bug found by syskaller: b/155928773
+// https://syzkaller.appspot.com/bug?id=065b893bd8d1d04a4e0a1d53c578537cde1efe99
+TEST_F(TuntapTest, WriteHangBug155928773) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN)));
+
+ FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(OpenAndAttachTap(kTapName, "10.0.0.1"));
+
+ int sock = socket(AF_INET, SOCK_DGRAM, 0);
+ ASSERT_THAT(sock, SyscallSucceeds());
+
+ struct sockaddr_in remote = {};
+ remote.sin_family = AF_INET;
+ remote.sin_port = htons(42);
+ inet_pton(AF_INET, "10.0.0.1", &remote.sin_addr);
+ // Return values do not matter in this test.
+ connect(sock, reinterpret_cast<struct sockaddr*>(&remote), sizeof(remote));
+ write(sock, "hello", 5);
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/tuntap_hostinet.cc b/test/syscalls/linux/tuntap_hostinet.cc
new file mode 100644
index 000000000..1513fb9d5
--- /dev/null
+++ b/test/syscalls/linux/tuntap_hostinet.cc
@@ -0,0 +1,38 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(TuntapHostInetTest, NoNetTun) {
+ SKIP_IF(!IsRunningOnGvisor());
+ SKIP_IF(!IsRunningWithHostinet());
+
+ struct stat statbuf;
+ ASSERT_THAT(stat("/dev/net/tun", &statbuf), SyscallFailsWithErrno(ENOENT));
+}
+
+} // namespace
+} // namespace testing
+
+} // namespace gvisor
diff --git a/test/syscalls/linux/udp_bind.cc b/test/syscalls/linux/udp_bind.cc
new file mode 100644
index 000000000..6d92bdbeb
--- /dev/null
+++ b/test/syscalls/linux/udp_bind.cc
@@ -0,0 +1,316 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+struct sockaddr_in_common {
+ sa_family_t sin_family;
+ in_port_t sin_port;
+};
+
+struct SendtoTestParam {
+ // Human readable description of test parameter.
+ std::string description;
+
+ // Test is broken in gVisor, skip.
+ bool skip_on_gvisor;
+
+ // Domain for the socket that will do the sending.
+ int send_domain;
+
+ // Address to bind for the socket that will do the sending.
+ struct sockaddr_storage send_addr;
+ socklen_t send_addr_len; // 0 for unbound.
+
+ // Address to connect to for the socket that will do the sending.
+ struct sockaddr_storage connect_addr;
+ socklen_t connect_addr_len; // 0 for no connection.
+
+ // Domain for the socket that will do the receiving.
+ int recv_domain;
+
+ // Address to bind for the socket that will do the receiving.
+ struct sockaddr_storage recv_addr;
+ socklen_t recv_addr_len;
+
+ // Address to send to.
+ struct sockaddr_storage sendto_addr;
+ socklen_t sendto_addr_len;
+
+ // Expected errno for the sendto call.
+ std::vector<int> sendto_errnos; // empty on success.
+};
+
+class SendtoTest : public ::testing::TestWithParam<SendtoTestParam> {
+ protected:
+ SendtoTest() {
+ // gUnit uses printf, so so will we.
+ printf("Testing with %s\n", GetParam().description.c_str());
+ }
+};
+
+TEST_P(SendtoTest, Sendto) {
+ auto param = GetParam();
+
+ SKIP_IF(param.skip_on_gvisor && IsRunningOnGvisor());
+
+ const FileDescriptor s1 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(param.send_domain, SOCK_DGRAM, 0));
+ const FileDescriptor s2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(param.recv_domain, SOCK_DGRAM, 0));
+
+ if (param.send_addr_len > 0) {
+ ASSERT_THAT(bind(s1.get(), reinterpret_cast<sockaddr*>(&param.send_addr),
+ param.send_addr_len),
+ SyscallSucceeds());
+ }
+
+ if (param.connect_addr_len > 0) {
+ ASSERT_THAT(
+ connect(s1.get(), reinterpret_cast<sockaddr*>(&param.connect_addr),
+ param.connect_addr_len),
+ SyscallSucceeds());
+ }
+
+ ASSERT_THAT(bind(s2.get(), reinterpret_cast<sockaddr*>(&param.recv_addr),
+ param.recv_addr_len),
+ SyscallSucceeds());
+
+ struct sockaddr_storage real_recv_addr = {};
+ socklen_t real_recv_addr_len = param.recv_addr_len;
+ ASSERT_THAT(
+ getsockname(s2.get(), reinterpret_cast<sockaddr*>(&real_recv_addr),
+ &real_recv_addr_len),
+ SyscallSucceeds());
+
+ ASSERT_EQ(real_recv_addr_len, param.recv_addr_len);
+
+ int recv_port =
+ reinterpret_cast<sockaddr_in_common*>(&real_recv_addr)->sin_port;
+
+ struct sockaddr_storage sendto_addr = param.sendto_addr;
+ reinterpret_cast<sockaddr_in_common*>(&sendto_addr)->sin_port = recv_port;
+
+ char buf[20] = {};
+ if (!param.sendto_errnos.empty()) {
+ ASSERT_THAT(RetryEINTR(sendto)(s1.get(), buf, sizeof(buf), 0,
+ reinterpret_cast<sockaddr*>(&sendto_addr),
+ param.sendto_addr_len),
+ SyscallFailsWithErrno(ElementOf(param.sendto_errnos)));
+ return;
+ }
+
+ ASSERT_THAT(RetryEINTR(sendto)(s1.get(), buf, sizeof(buf), 0,
+ reinterpret_cast<sockaddr*>(&sendto_addr),
+ param.sendto_addr_len),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ struct sockaddr_storage got_addr = {};
+ socklen_t got_addr_len = sizeof(sockaddr_storage);
+ ASSERT_THAT(RetryEINTR(recvfrom)(s2.get(), buf, sizeof(buf), 0,
+ reinterpret_cast<sockaddr*>(&got_addr),
+ &got_addr_len),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ ASSERT_GT(got_addr_len, sizeof(sockaddr_in_common));
+ int got_port = reinterpret_cast<sockaddr_in_common*>(&got_addr)->sin_port;
+
+ struct sockaddr_storage sender_addr = {};
+ socklen_t sender_addr_len = sizeof(sockaddr_storage);
+ ASSERT_THAT(getsockname(s1.get(), reinterpret_cast<sockaddr*>(&sender_addr),
+ &sender_addr_len),
+ SyscallSucceeds());
+
+ ASSERT_GT(sender_addr_len, sizeof(sockaddr_in_common));
+ int sender_port =
+ reinterpret_cast<sockaddr_in_common*>(&sender_addr)->sin_port;
+
+ EXPECT_EQ(got_port, sender_port);
+}
+
+socklen_t Ipv4Addr(sockaddr_storage* addr, int port = 0) {
+ auto addr4 = reinterpret_cast<sockaddr_in*>(addr);
+ addr4->sin_family = AF_INET;
+ addr4->sin_port = port;
+ inet_pton(AF_INET, "127.0.0.1", &addr4->sin_addr.s_addr);
+ return sizeof(struct sockaddr_in);
+}
+
+socklen_t Ipv6Addr(sockaddr_storage* addr, int port = 0) {
+ auto addr6 = reinterpret_cast<sockaddr_in6*>(addr);
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_port = port;
+ inet_pton(AF_INET6, "::1", &addr6->sin6_addr.s6_addr);
+ return sizeof(struct sockaddr_in6);
+}
+
+socklen_t Ipv4MappedIpv6Addr(sockaddr_storage* addr, int port = 0) {
+ auto addr6 = reinterpret_cast<sockaddr_in6*>(addr);
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_port = port;
+ inet_pton(AF_INET6, "::ffff:127.0.0.1", &addr6->sin6_addr.s6_addr);
+ return sizeof(struct sockaddr_in6);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ UdpBindTest, SendtoTest,
+ ::testing::Values(
+ []() {
+ SendtoTestParam param = {};
+ param.description = "IPv4 mapped IPv6 sendto IPv4 mapped IPv6";
+ param.send_domain = AF_INET6;
+ param.send_addr_len = Ipv4MappedIpv6Addr(&param.send_addr);
+ param.recv_domain = AF_INET6;
+ param.recv_addr_len = Ipv4MappedIpv6Addr(&param.recv_addr);
+ param.sendto_addr_len = Ipv4MappedIpv6Addr(&param.sendto_addr);
+ return param;
+ }(),
+ []() {
+ SendtoTestParam param = {};
+ param.description = "IPv6 sendto IPv6";
+ param.send_domain = AF_INET6;
+ param.send_addr_len = Ipv6Addr(&param.send_addr);
+ param.recv_domain = AF_INET6;
+ param.recv_addr_len = Ipv6Addr(&param.recv_addr);
+ param.sendto_addr_len = Ipv6Addr(&param.sendto_addr);
+ return param;
+ }(),
+ []() {
+ SendtoTestParam param = {};
+ param.description = "IPv4 sendto IPv4";
+ param.send_domain = AF_INET;
+ param.send_addr_len = Ipv4Addr(&param.send_addr);
+ param.recv_domain = AF_INET;
+ param.recv_addr_len = Ipv4Addr(&param.recv_addr);
+ param.sendto_addr_len = Ipv4Addr(&param.sendto_addr);
+ return param;
+ }(),
+ []() {
+ SendtoTestParam param = {};
+ param.description = "IPv4 mapped IPv6 sendto IPv4";
+ param.send_domain = AF_INET6;
+ param.send_addr_len = Ipv4MappedIpv6Addr(&param.send_addr);
+ param.recv_domain = AF_INET;
+ param.recv_addr_len = Ipv4Addr(&param.recv_addr);
+ param.sendto_addr_len = Ipv4MappedIpv6Addr(&param.sendto_addr);
+ return param;
+ }(),
+ []() {
+ SendtoTestParam param = {};
+ param.description = "IPv4 sendto IPv4 mapped IPv6";
+ param.send_domain = AF_INET;
+ param.send_addr_len = Ipv4Addr(&param.send_addr);
+ param.recv_domain = AF_INET6;
+ param.recv_addr_len = Ipv4MappedIpv6Addr(&param.recv_addr);
+ param.sendto_addr_len = Ipv4Addr(&param.sendto_addr);
+ return param;
+ }(),
+ []() {
+ SendtoTestParam param = {};
+ param.description = "unbound IPv6 sendto IPv4 mapped IPv6";
+ param.send_domain = AF_INET6;
+ param.recv_domain = AF_INET6;
+ param.recv_addr_len = Ipv4MappedIpv6Addr(&param.recv_addr);
+ param.sendto_addr_len = Ipv4MappedIpv6Addr(&param.sendto_addr);
+ return param;
+ }(),
+ []() {
+ SendtoTestParam param = {};
+ param.description = "unbound IPv6 sendto IPv4";
+ param.send_domain = AF_INET6;
+ param.recv_domain = AF_INET;
+ param.recv_addr_len = Ipv4Addr(&param.recv_addr);
+ param.sendto_addr_len = Ipv4MappedIpv6Addr(&param.sendto_addr);
+ return param;
+ }(),
+ []() {
+ SendtoTestParam param = {};
+ param.description = "IPv6 sendto IPv4";
+ param.send_domain = AF_INET6;
+ param.send_addr_len = Ipv6Addr(&param.send_addr);
+ param.recv_domain = AF_INET;
+ param.recv_addr_len = Ipv4Addr(&param.recv_addr);
+ param.sendto_addr_len = Ipv4MappedIpv6Addr(&param.sendto_addr);
+ param.sendto_errnos = {ENETUNREACH};
+ return param;
+ }(),
+ []() {
+ SendtoTestParam param = {};
+ param.description = "IPv4 mapped IPv6 sendto IPv6";
+ param.send_domain = AF_INET6;
+ param.send_addr_len = Ipv4MappedIpv6Addr(&param.send_addr);
+ param.recv_domain = AF_INET6;
+ param.recv_addr_len = Ipv6Addr(&param.recv_addr);
+ param.sendto_addr_len = Ipv6Addr(&param.sendto_addr);
+ param.sendto_errnos = {EAFNOSUPPORT};
+ // The errno returned changed in Linux commit c8e6ad0829a723.
+ param.sendto_errnos = {EINVAL, EAFNOSUPPORT};
+ return param;
+ }(),
+ []() {
+ SendtoTestParam param = {};
+ param.description = "connected IPv4 mapped IPv6 sendto IPv6";
+ param.send_domain = AF_INET6;
+ param.connect_addr_len =
+ Ipv4MappedIpv6Addr(&param.connect_addr, 5000);
+ param.recv_domain = AF_INET6;
+ param.recv_addr_len = Ipv6Addr(&param.recv_addr);
+ param.sendto_addr_len = Ipv6Addr(&param.sendto_addr);
+ // The errno returned changed in Linux commit c8e6ad0829a723.
+ param.sendto_errnos = {EINVAL, EAFNOSUPPORT};
+ return param;
+ }(),
+ []() {
+ SendtoTestParam param = {};
+ param.description = "connected IPv6 sendto IPv4 mapped IPv6";
+ // TODO(igudger): Determine if this inconsistent behavior is worth
+ // implementing.
+ param.skip_on_gvisor = true;
+ param.send_domain = AF_INET6;
+ param.connect_addr_len = Ipv6Addr(&param.connect_addr, 5000);
+ param.recv_domain = AF_INET6;
+ param.recv_addr_len = Ipv4MappedIpv6Addr(&param.recv_addr);
+ param.sendto_addr_len = Ipv4MappedIpv6Addr(&param.sendto_addr);
+ return param;
+ }(),
+ []() {
+ SendtoTestParam param = {};
+ param.description = "connected IPv6 sendto IPv4";
+ // TODO(igudger): Determine if this inconsistent behavior is worth
+ // implementing.
+ param.skip_on_gvisor = true;
+ param.send_domain = AF_INET6;
+ param.connect_addr_len = Ipv6Addr(&param.connect_addr, 5000);
+ param.recv_domain = AF_INET;
+ param.recv_addr_len = Ipv4Addr(&param.recv_addr);
+ param.sendto_addr_len = Ipv4MappedIpv6Addr(&param.sendto_addr);
+ return param;
+ }()));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket.cc b/test/syscalls/linux/udp_socket.cc
new file mode 100644
index 000000000..7a8ac30a4
--- /dev/null
+++ b/test/syscalls/linux/udp_socket.cc
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/udp_socket_test_cases.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+INSTANTIATE_TEST_SUITE_P(AllInetTests, UdpSocketTest,
+ ::testing::Values(AddressFamily::kIpv4,
+ AddressFamily::kIpv6,
+ AddressFamily::kDualStack));
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_errqueue_test_case.cc b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
new file mode 100644
index 000000000..54a0594f7
--- /dev/null
+++ b/test/syscalls/linux/udp_socket_errqueue_test_case.cc
@@ -0,0 +1,57 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __fuchsia__
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <linux/errqueue.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/udp_socket_test_cases.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+TEST_P(UdpSocketTest, ErrorQueue) {
+ char cmsgbuf[CMSG_SPACE(sizeof(sock_extended_err))];
+ msghdr msg;
+ memset(&msg, 0, sizeof(msg));
+ iovec iov;
+ memset(&iov, 0, sizeof(iov));
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = sizeof(cmsgbuf);
+
+ // recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT.
+ EXPECT_THAT(RetryEINTR(recvmsg)(bind_.get(), &msg, MSG_ERRQUEUE),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // __fuchsia__
diff --git a/test/syscalls/linux/udp_socket_test_cases.cc b/test/syscalls/linux/udp_socket_test_cases.cc
new file mode 100644
index 000000000..9cc6be4fb
--- /dev/null
+++ b/test/syscalls/linux/udp_socket_test_cases.cc
@@ -0,0 +1,1727 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/udp_socket_test_cases.h"
+
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <netinet/in.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "absl/strings/str_format.h"
+#ifndef SIOCGSTAMP
+#include <linux/sockios.h>
+#endif
+
+#include "gtest/gtest.h"
+#include "absl/base/macros.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/syscalls/linux/ip_socket_test_util.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// Gets a pointer to the port component of the given address.
+uint16_t* Port(struct sockaddr_storage* addr) {
+ switch (addr->ss_family) {
+ case AF_INET: {
+ auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
+ return &sin->sin_port;
+ }
+ case AF_INET6: {
+ auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr);
+ return &sin6->sin6_port;
+ }
+ }
+
+ return nullptr;
+}
+
+// Sets addr port to "port".
+void SetPort(struct sockaddr_storage* addr, uint16_t port) {
+ switch (addr->ss_family) {
+ case AF_INET: {
+ auto sin = reinterpret_cast<struct sockaddr_in*>(addr);
+ sin->sin_port = port;
+ break;
+ }
+ case AF_INET6: {
+ auto sin6 = reinterpret_cast<struct sockaddr_in6*>(addr);
+ sin6->sin6_port = port;
+ break;
+ }
+ }
+}
+
+void UdpSocketTest::SetUp() {
+ addrlen_ = GetAddrLength();
+
+ bind_ =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
+ memset(&bind_addr_storage_, 0, sizeof(bind_addr_storage_));
+ bind_addr_ = reinterpret_cast<struct sockaddr*>(&bind_addr_storage_);
+
+ sock_ =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
+}
+
+int UdpSocketTest::GetFamily() {
+ if (GetParam() == AddressFamily::kIpv4) {
+ return AF_INET;
+ }
+ return AF_INET6;
+}
+
+PosixError UdpSocketTest::BindLoopback() {
+ bind_addr_storage_ = InetLoopbackAddr();
+ struct sockaddr* bind_addr_ =
+ reinterpret_cast<struct sockaddr*>(&bind_addr_storage_);
+ return BindSocket(bind_.get(), bind_addr_);
+}
+
+PosixError UdpSocketTest::BindAny() {
+ bind_addr_storage_ = InetAnyAddr();
+ struct sockaddr* bind_addr_ =
+ reinterpret_cast<struct sockaddr*>(&bind_addr_storage_);
+ return BindSocket(bind_.get(), bind_addr_);
+}
+
+PosixError UdpSocketTest::BindSocket(int socket, struct sockaddr* addr) {
+ socklen_t len = sizeof(bind_addr_storage_);
+
+ // Bind, then check that we get the right address.
+ RETURN_ERROR_IF_SYSCALL_FAIL(bind(socket, addr, addrlen_));
+
+ RETURN_ERROR_IF_SYSCALL_FAIL(getsockname(socket, addr, &len));
+
+ if (addrlen_ != len) {
+ return PosixError(
+ EINVAL,
+ absl::StrFormat("getsockname len: %u expected: %u", len, addrlen_));
+ }
+ return PosixError(0);
+}
+
+socklen_t UdpSocketTest::GetAddrLength() {
+ struct sockaddr_storage addr;
+ if (GetFamily() == AF_INET) {
+ auto sin = reinterpret_cast<struct sockaddr_in*>(&addr);
+ return sizeof(*sin);
+ }
+
+ auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&addr);
+ return sizeof(*sin6);
+}
+
+sockaddr_storage UdpSocketTest::InetAnyAddr() {
+ struct sockaddr_storage addr;
+ memset(&addr, 0, sizeof(addr));
+ reinterpret_cast<struct sockaddr*>(&addr)->sa_family = GetFamily();
+
+ if (GetFamily() == AF_INET) {
+ auto sin = reinterpret_cast<struct sockaddr_in*>(&addr);
+ sin->sin_addr.s_addr = htonl(INADDR_ANY);
+ sin->sin_port = htons(0);
+ return addr;
+ }
+
+ auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&addr);
+ sin6->sin6_addr = IN6ADDR_ANY_INIT;
+ sin6->sin6_port = htons(0);
+ return addr;
+}
+
+sockaddr_storage UdpSocketTest::InetLoopbackAddr() {
+ struct sockaddr_storage addr;
+ memset(&addr, 0, sizeof(addr));
+ reinterpret_cast<struct sockaddr*>(&addr)->sa_family = GetFamily();
+
+ if (GetFamily() == AF_INET) {
+ auto sin = reinterpret_cast<struct sockaddr_in*>(&addr);
+ sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ sin->sin_port = htons(0);
+ return addr;
+ }
+ auto sin6 = reinterpret_cast<struct sockaddr_in6*>(&addr);
+ sin6->sin6_addr = in6addr_loopback;
+ sin6->sin6_port = htons(0);
+ return addr;
+}
+
+void UdpSocketTest::Disconnect(int sockfd) {
+ sockaddr_storage addr_storage = InetAnyAddr();
+ sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ socklen_t addrlen = sizeof(addr_storage);
+
+ addr->sa_family = AF_UNSPEC;
+ ASSERT_THAT(connect(sockfd, addr, addrlen), SyscallSucceeds());
+
+ // Check that after disconnect the socket is bound to the ANY address.
+ EXPECT_THAT(getsockname(sockfd, addr, &addrlen), SyscallSucceeds());
+ if (GetParam() == AddressFamily::kIpv4) {
+ auto addr_out = reinterpret_cast<struct sockaddr_in*>(addr);
+ EXPECT_EQ(addrlen, sizeof(*addr_out));
+ EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_ANY));
+ } else {
+ auto addr_out = reinterpret_cast<struct sockaddr_in6*>(addr);
+ EXPECT_EQ(addrlen, sizeof(*addr_out));
+ struct in6_addr loopback = IN6ADDR_ANY_INIT;
+
+ EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
+ }
+}
+
+TEST_P(UdpSocketTest, Creation) {
+ FileDescriptor sock =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
+ EXPECT_THAT(close(sock.release()), SyscallSucceeds());
+
+ sock = ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, 0));
+ EXPECT_THAT(close(sock.release()), SyscallSucceeds());
+
+ ASSERT_THAT(socket(GetFamily(), SOCK_STREAM, IPPROTO_UDP), SyscallFails());
+}
+
+TEST_P(UdpSocketTest, Getsockname) {
+ // Check that we're not bound.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getsockname(bind_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ struct sockaddr_storage any = InetAnyAddr();
+ EXPECT_EQ(memcmp(&addr, reinterpret_cast<struct sockaddr*>(&any), addrlen_),
+ 0);
+
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ EXPECT_THAT(
+ getsockname(bind_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(memcmp(&addr, bind_addr_, addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, Getpeername) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Check that we're not connected.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+
+ // Connect, then check that we get the right address.
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(memcmp(&addr, bind_addr_, addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, SendNotConnected) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Do send & write, they must fail.
+ char buf[512];
+ EXPECT_THAT(send(sock_.get(), buf, sizeof(buf), 0),
+ SyscallFailsWithErrno(EDESTADDRREQ));
+
+ EXPECT_THAT(write(sock_.get(), buf, sizeof(buf)),
+ SyscallFailsWithErrno(EDESTADDRREQ));
+
+ // Use sendto.
+ ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Check that we're bound now.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_NE(*Port(&addr), 0);
+}
+
+TEST_P(UdpSocketTest, ConnectBinds) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Connect the socket.
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ // Check that we're bound now.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_NE(*Port(&addr), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveNotBound) {
+ char buf[512];
+ EXPECT_THAT(recv(sock_.get(), buf, sizeof(buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, Bind) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Try to bind again.
+ EXPECT_THAT(bind(bind_.get(), bind_addr_, addrlen_),
+ SyscallFailsWithErrno(EINVAL));
+
+ // Check that we're still bound to the original address.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getsockname(bind_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(memcmp(&addr, bind_addr_, addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, BindInUse) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Try to bind again.
+ EXPECT_THAT(bind(sock_.get(), bind_addr_, addrlen_),
+ SyscallFailsWithErrno(EADDRINUSE));
+}
+
+TEST_P(UdpSocketTest, ReceiveAfterConnect) {
+ ASSERT_NO_ERRNO(BindLoopback());
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ // Send from sock_ to bind_
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+ ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Receive the data.
+ char received[sizeof(buf)];
+ EXPECT_THAT(recv(bind_.get(), received, sizeof(received), 0),
+ SyscallSucceedsWithValue(sizeof(received)));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveAfterDisconnect) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ for (int i = 0; i < 2; i++) {
+ // Connet sock_ to bound address.
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+
+ // Send from sock to bind_.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ ASSERT_THAT(sendto(bind_.get(), buf, sizeof(buf), 0,
+ reinterpret_cast<sockaddr*>(&addr), addrlen),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Receive the data.
+ char received[sizeof(buf)];
+ EXPECT_THAT(recv(sock_.get(), received, sizeof(received), 0),
+ SyscallSucceedsWithValue(sizeof(received)));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+
+ // Disconnect sock_.
+ struct sockaddr unspec = {};
+ unspec.sa_family = AF_UNSPEC;
+ ASSERT_THAT(connect(sock_.get(), &unspec, sizeof(unspec.sa_family)),
+ SyscallSucceeds());
+ }
+}
+
+TEST_P(UdpSocketTest, Connect) {
+ ASSERT_NO_ERRNO(BindLoopback());
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ // Check that we're connected to the right peer.
+ struct sockaddr_storage peer;
+ socklen_t peerlen = sizeof(peer);
+ EXPECT_THAT(
+ getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&peer), &peerlen),
+ SyscallSucceeds());
+ EXPECT_EQ(peerlen, addrlen_);
+ EXPECT_EQ(memcmp(&peer, bind_addr_, addrlen_), 0);
+
+ // Try to bind after connect.
+ struct sockaddr_storage any = InetAnyAddr();
+ EXPECT_THAT(
+ bind(sock_.get(), reinterpret_cast<struct sockaddr*>(&any), addrlen_),
+ SyscallFailsWithErrno(EINVAL));
+
+ struct sockaddr_storage bind2_storage = InetLoopbackAddr();
+ struct sockaddr* bind2_addr =
+ reinterpret_cast<struct sockaddr*>(&bind2_storage);
+ FileDescriptor bind2 =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetFamily(), SOCK_DGRAM, IPPROTO_UDP));
+ ASSERT_NO_ERRNO(BindSocket(bind2.get(), bind2_addr));
+
+ // Try to connect again.
+ EXPECT_THAT(connect(sock_.get(), bind2_addr, addrlen_), SyscallSucceeds());
+
+ // Check that peer name changed.
+ peerlen = sizeof(peer);
+ EXPECT_THAT(
+ getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&peer), &peerlen),
+ SyscallSucceeds());
+ EXPECT_EQ(peerlen, addrlen_);
+ EXPECT_EQ(memcmp(&peer, bind2_addr, addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, ConnectAnyZero) {
+ // TODO(138658473): Enable when we can connect to port 0 with gVisor.
+ SKIP_IF(IsRunningOnGvisor());
+
+ struct sockaddr_storage any = InetAnyAddr();
+ EXPECT_THAT(
+ connect(sock_.get(), reinterpret_cast<struct sockaddr*>(&any), addrlen_),
+ SyscallSucceeds());
+
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, ConnectAnyWithPort) {
+ ASSERT_NO_ERRNO(BindAny());
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterConnectAny) {
+ // TODO(138658473): Enable when we can connect to port 0 with gVisor.
+ SKIP_IF(IsRunningOnGvisor());
+ struct sockaddr_storage any = InetAnyAddr();
+ EXPECT_THAT(
+ connect(sock_.get(), reinterpret_cast<struct sockaddr*>(&any), addrlen_),
+ SyscallSucceeds());
+
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+
+ Disconnect(sock_.get());
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterConnectAnyWithPort) {
+ ASSERT_NO_ERRNO(BindAny());
+ EXPECT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(*Port(&bind_addr_storage_), *Port(&addr));
+
+ Disconnect(sock_.get());
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterBind) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Bind to the next port above bind_.
+ struct sockaddr_storage addr_storage = InetLoopbackAddr();
+ struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+ ASSERT_NO_ERRNO(BindSocket(sock_.get(), addr));
+
+ // Connect the socket.
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ struct sockaddr_storage unspec = {};
+ unspec.ss_family = AF_UNSPEC;
+ EXPECT_THAT(connect(sock_.get(), reinterpret_cast<sockaddr*>(&unspec),
+ sizeof(unspec.ss_family)),
+ SyscallSucceeds());
+
+ // Check that we're still bound.
+ socklen_t addrlen = sizeof(unspec);
+ EXPECT_THAT(
+ getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&unspec), &addrlen),
+ SyscallSucceeds());
+
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(memcmp(addr, &unspec, addrlen_), 0);
+
+ addrlen = sizeof(addr);
+ EXPECT_THAT(getpeername(sock_.get(), addr, &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, BindToAnyConnnectToLocalhost) {
+ ASSERT_NO_ERRNO(BindAny());
+
+ struct sockaddr_storage addr_storage = InetLoopbackAddr();
+ struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+ socklen_t addrlen = sizeof(addr);
+
+ // Connect the socket.
+ ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+ EXPECT_THAT(getsockname(bind_.get(), addr, &addrlen), SyscallSucceeds());
+
+ // If the socket is bound to ANY and connected to a loopback address,
+ // getsockname() has to return the loopback address.
+ if (GetParam() == AddressFamily::kIpv4) {
+ auto addr_out = reinterpret_cast<struct sockaddr_in*>(addr);
+ EXPECT_EQ(addrlen, sizeof(*addr_out));
+ EXPECT_EQ(addr_out->sin_addr.s_addr, htonl(INADDR_LOOPBACK));
+ } else {
+ auto addr_out = reinterpret_cast<struct sockaddr_in6*>(addr);
+ struct in6_addr loopback = IN6ADDR_LOOPBACK_INIT;
+ EXPECT_EQ(addrlen, sizeof(*addr_out));
+ EXPECT_EQ(memcmp(&addr_out->sin6_addr, &loopback, sizeof(in6_addr)), 0);
+ }
+}
+
+TEST_P(UdpSocketTest, DisconnectAfterBindToAny) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ struct sockaddr_storage any_storage = InetAnyAddr();
+ struct sockaddr* any = reinterpret_cast<struct sockaddr*>(&any_storage);
+ SetPort(&any_storage, *Port(&bind_addr_storage_) + 1);
+
+ ASSERT_NO_ERRNO(BindSocket(sock_.get(), any));
+
+ // Connect the socket.
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ Disconnect(sock_.get());
+
+ // Check that we're still bound.
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(memcmp(&addr, any, addrlen), 0);
+
+ addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, Disconnect) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ struct sockaddr_storage any_storage = InetAnyAddr();
+ struct sockaddr* any = reinterpret_cast<struct sockaddr*>(&any_storage);
+ SetPort(&any_storage, *Port(&bind_addr_storage_) + 1);
+ ASSERT_NO_ERRNO(BindSocket(sock_.get(), any));
+
+ for (int i = 0; i < 2; i++) {
+ // Try to connect again.
+ EXPECT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ // Check that we're connected to the right peer.
+ struct sockaddr_storage peer;
+ socklen_t peerlen = sizeof(peer);
+ EXPECT_THAT(
+ getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&peer), &peerlen),
+ SyscallSucceeds());
+ EXPECT_EQ(peerlen, addrlen_);
+ EXPECT_EQ(memcmp(&peer, bind_addr_, addrlen_), 0);
+
+ // Try to disconnect.
+ struct sockaddr_storage addr = {};
+ addr.ss_family = AF_UNSPEC;
+ EXPECT_THAT(connect(sock_.get(), reinterpret_cast<sockaddr*>(&addr),
+ sizeof(addr.ss_family)),
+ SyscallSucceeds());
+
+ peerlen = sizeof(peer);
+ EXPECT_THAT(
+ getpeername(sock_.get(), reinterpret_cast<sockaddr*>(&peer), &peerlen),
+ SyscallFailsWithErrno(ENOTCONN));
+
+ // Check that we're still bound.
+ socklen_t addrlen = sizeof(addr);
+ EXPECT_THAT(
+ getsockname(sock_.get(), reinterpret_cast<sockaddr*>(&addr), &addrlen),
+ SyscallSucceeds());
+ EXPECT_EQ(addrlen, addrlen_);
+ EXPECT_EQ(*Port(&addr), *Port(&any_storage));
+ }
+}
+
+TEST_P(UdpSocketTest, ConnectBadAddress) {
+ struct sockaddr addr = {};
+ addr.sa_family = GetFamily();
+ ASSERT_THAT(connect(sock_.get(), &addr, sizeof(addr.sa_family)),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(UdpSocketTest, SendToAddressOtherThanConnected) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ struct sockaddr_storage addr_storage = InetAnyAddr();
+ struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ // Send to a different destination than we're connected to.
+ char buf[512];
+ EXPECT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, addr, addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+}
+
+TEST_P(UdpSocketTest, ZerolengthWriteAllowed) {
+ // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
+ SKIP_IF(IsRunningWithHostinet());
+
+ ASSERT_NO_ERRNO(BindLoopback());
+ // Connect to loopback:bind_addr_+1.
+ struct sockaddr_storage addr_storage = InetLoopbackAddr();
+ struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+ ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+ // Bind sock to loopback:bind_addr_+1.
+ ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
+
+ char buf[3];
+ // Send zero length packet from bind_ to sock_.
+ ASSERT_THAT(write(bind_.get(), buf, 0), SyscallSucceedsWithValue(0));
+
+ struct pollfd pfd = {sock_.get(), POLLIN, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout*/ 1000),
+ SyscallSucceedsWithValue(1));
+
+ // Receive the packet.
+ char received[3];
+ EXPECT_THAT(read(sock_.get(), received, sizeof(received)),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, ZerolengthWriteAllowedNonBlockRead) {
+ // TODO(gvisor.dev/issue/1202): Hostinet does not support zero length writes.
+ SKIP_IF(IsRunningWithHostinet());
+
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Connect to loopback:bind_addr_port+1.
+ struct sockaddr_storage addr_storage = InetLoopbackAddr();
+ struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+ ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+ // Bind sock to loopback:bind_addr_port+1.
+ ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
+
+ // Set sock to non-blocking.
+ int opts = 0;
+ ASSERT_THAT(opts = fcntl(sock_.get(), F_GETFL), SyscallSucceeds());
+ ASSERT_THAT(fcntl(sock_.get(), F_SETFL, opts | O_NONBLOCK),
+ SyscallSucceeds());
+
+ char buf[3];
+ // Send zero length packet from bind_ to sock_.
+ ASSERT_THAT(write(bind_.get(), buf, 0), SyscallSucceedsWithValue(0));
+
+ struct pollfd pfd = {sock_.get(), POLLIN, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+ SyscallSucceedsWithValue(1));
+
+ // Receive the packet.
+ char received[3];
+ EXPECT_THAT(read(sock_.get(), received, sizeof(received)),
+ SyscallSucceedsWithValue(0));
+ EXPECT_THAT(read(sock_.get(), received, sizeof(received)),
+ SyscallFailsWithErrno(EAGAIN));
+}
+
+TEST_P(UdpSocketTest, SendAndReceiveNotConnected) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Send some data to bind_.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Receive the data.
+ char received[sizeof(buf)];
+ EXPECT_THAT(recv(bind_.get(), received, sizeof(received), 0),
+ SyscallSucceedsWithValue(sizeof(received)));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, SendAndReceiveConnected) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Connect to loopback:bind_addr_port+1.
+ struct sockaddr_storage addr_storage = InetLoopbackAddr();
+ struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+ ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+ // Bind sock to loopback:TestPort+1.
+ ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
+
+ // Send some data from sock to bind_.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Receive the data.
+ char received[sizeof(buf)];
+ EXPECT_THAT(recv(bind_.get(), received, sizeof(received), 0),
+ SyscallSucceedsWithValue(sizeof(received)));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, ReceiveFromNotConnected) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Connect to loopback:bind_addr_port+1.
+ struct sockaddr_storage addr_storage = InetLoopbackAddr();
+ struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+ ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+ // Bind sock to loopback:bind_addr_port+2.
+ struct sockaddr_storage addr2_storage = InetLoopbackAddr();
+ struct sockaddr* addr2 = reinterpret_cast<struct sockaddr*>(&addr2_storage);
+ SetPort(&addr2_storage, *Port(&bind_addr_storage_) + 2);
+ ASSERT_THAT(bind(sock_.get(), addr2, addrlen_), SyscallSucceeds());
+
+ // Send some data from sock to bind_.
+ char buf[512];
+ ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Check that the data isn't received because it was sent from a different
+ // address than we're connected.
+ EXPECT_THAT(recv(sock_.get(), buf, sizeof(buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReceiveBeforeConnect) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Bind sock to loopback:bind_addr_port+2.
+ struct sockaddr_storage addr2_storage = InetLoopbackAddr();
+ struct sockaddr* addr2 = reinterpret_cast<struct sockaddr*>(&addr2_storage);
+ SetPort(&addr2_storage, *Port(&bind_addr_storage_) + 2);
+ ASSERT_THAT(bind(sock_.get(), addr2, addrlen_), SyscallSucceeds());
+
+ // Send some data from sock to bind_.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Connect to loopback:TestPort+1.
+ struct sockaddr_storage addr_storage = InetLoopbackAddr();
+ struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+ ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+ // Receive the data. It works because it was sent before the connect.
+ char received[sizeof(buf)];
+ EXPECT_THAT(recv(bind_.get(), received, sizeof(received), 0),
+ SyscallSucceedsWithValue(sizeof(received)));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+
+ // Send again. This time it should not be received.
+ ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ EXPECT_THAT(recv(bind_.get(), buf, sizeof(buf), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReceiveFrom) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Connect to loopback:bind_addr_port+1.
+ struct sockaddr_storage addr_storage = InetLoopbackAddr();
+ struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+ ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+ // Bind sock to loopback:TestPort+1.
+ ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
+
+ // Send some data from sock to bind_.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ // Receive the data and sender address.
+ char received[sizeof(buf)];
+ struct sockaddr_storage addr2;
+ socklen_t addr2len = sizeof(addr2);
+ EXPECT_THAT(recvfrom(bind_.get(), received, sizeof(received), 0,
+ reinterpret_cast<sockaddr*>(&addr2), &addr2len),
+ SyscallSucceedsWithValue(sizeof(received)));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+ EXPECT_EQ(addr2len, addrlen_);
+ EXPECT_EQ(memcmp(addr, &addr2, addrlen_), 0);
+}
+
+TEST_P(UdpSocketTest, Listen) {
+ ASSERT_THAT(listen(sock_.get(), SOMAXCONN),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+TEST_P(UdpSocketTest, Accept) {
+ ASSERT_THAT(accept(sock_.get(), nullptr, nullptr),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+// This test validates that a read shutdown with pending data allows the read
+// to proceed with the data before returning EAGAIN.
+TEST_P(UdpSocketTest, ReadShutdownNonblockPendingData) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Connect to loopback:bind_addr_port+1.
+ struct sockaddr_storage addr_storage = InetLoopbackAddr();
+ struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+ ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+ // Bind to loopback:bind_addr_port+1 and connect to bind_addr_.
+ ASSERT_THAT(bind(sock_.get(), addr, addrlen_), SyscallSucceeds());
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ // Verify that we get EWOULDBLOCK when there is nothing to read.
+ char received[512];
+ EXPECT_THAT(recv(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ const char* buf = "abc";
+ EXPECT_THAT(write(sock_.get(), buf, 3), SyscallSucceedsWithValue(3));
+
+ int opts = 0;
+ ASSERT_THAT(opts = fcntl(bind_.get(), F_GETFL), SyscallSucceeds());
+ ASSERT_THAT(fcntl(bind_.get(), F_SETFL, opts | O_NONBLOCK),
+ SyscallSucceeds());
+ ASSERT_THAT(opts = fcntl(bind_.get(), F_GETFL), SyscallSucceeds());
+ ASSERT_NE(opts & O_NONBLOCK, 0);
+
+ EXPECT_THAT(shutdown(bind_.get(), SHUT_RD), SyscallSucceeds());
+
+ struct pollfd pfd = {bind_.get(), POLLIN, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+ SyscallSucceedsWithValue(1));
+
+ // We should get the data even though read has been shutdown.
+ EXPECT_THAT(recv(bind_.get(), received, 2, 0), SyscallSucceedsWithValue(2));
+
+ // Because we read less than the entire packet length, since it's a packet
+ // based socket any subsequent reads should return EWOULDBLOCK.
+ EXPECT_THAT(recv(bind_.get(), received, 1, 0),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+// This test is validating that even after a socket is shutdown if it's
+// reconnected it will reset the shutdown state.
+TEST_P(UdpSocketTest, ReadShutdownSameSocketResetsShutdownState) {
+ char received[512];
+ EXPECT_THAT(recv(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ EXPECT_THAT(shutdown(bind_.get(), SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
+
+ EXPECT_THAT(recv(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Connect the socket, then try to shutdown again.
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Connect to loopback:bind_addr_port+1.
+ struct sockaddr_storage addr_storage = InetLoopbackAddr();
+ struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
+ SetPort(&addr_storage, *Port(&bind_addr_storage_) + 1);
+ ASSERT_THAT(connect(bind_.get(), addr, addrlen_), SyscallSucceeds());
+
+ EXPECT_THAT(recv(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+}
+
+TEST_P(UdpSocketTest, ReadShutdown) {
+ // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
+ // MSG_DONTWAIT blocks indefinitely.
+ SKIP_IF(IsRunningWithHostinet());
+
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ char received[512];
+ EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ EXPECT_THAT(shutdown(sock_.get(), SHUT_RD), SyscallFailsWithErrno(ENOTCONN));
+
+ EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Connect the socket, then try to shutdown again.
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ EXPECT_THAT(shutdown(sock_.get(), SHUT_RD), SyscallSucceeds());
+
+ EXPECT_THAT(recv(sock_.get(), received, sizeof(received), 0),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, ReadShutdownDifferentThread) {
+ // TODO(gvisor.dev/issue/1202): Calling recv() after shutdown without
+ // MSG_DONTWAIT blocks indefinitely.
+ SKIP_IF(IsRunningWithHostinet());
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ char received[512];
+ EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Connect the socket, then shutdown from another thread.
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ EXPECT_THAT(recv(sock_.get(), received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ ScopedThread t([&] {
+ absl::SleepFor(absl::Milliseconds(200));
+ EXPECT_THAT(shutdown(sock_.get(), SHUT_RD), SyscallSucceeds());
+ });
+ EXPECT_THAT(RetryEINTR(recv)(sock_.get(), received, sizeof(received), 0),
+ SyscallSucceedsWithValue(0));
+ t.Join();
+
+ EXPECT_THAT(RetryEINTR(recv)(sock_.get(), received, sizeof(received), 0),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UdpSocketTest, WriteShutdown) {
+ ASSERT_NO_ERRNO(BindLoopback());
+ EXPECT_THAT(shutdown(sock_.get(), SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+ EXPECT_THAT(shutdown(sock_.get(), SHUT_WR), SyscallSucceeds());
+}
+
+TEST_P(UdpSocketTest, SynchronousReceive) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Send some data to bind_ from another thread.
+ char buf[512];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ // Receive the data prior to actually starting the other thread.
+ char received[512];
+ EXPECT_THAT(
+ RetryEINTR(recv)(bind_.get(), received, sizeof(received), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EWOULDBLOCK));
+
+ // Start the thread.
+ ScopedThread t([&] {
+ absl::SleepFor(absl::Milliseconds(200));
+ ASSERT_THAT(sendto(sock_.get(), buf, sizeof(buf), 0, this->bind_addr_,
+ this->addrlen_),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ });
+
+ EXPECT_THAT(RetryEINTR(recv)(bind_.get(), received, sizeof(received), 0),
+ SyscallSucceedsWithValue(512));
+ EXPECT_EQ(memcmp(buf, received, sizeof(buf)), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_SendRecv) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Send 3 packets from sock to bind_.
+ constexpr int psize = 100;
+ char buf[3 * psize];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_THAT(
+ sendto(sock_.get(), buf + i * psize, psize, 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(psize));
+ }
+
+ // Receive the data as 3 separate packets.
+ char received[6 * psize];
+ for (int i = 0; i < 3; ++i) {
+ EXPECT_THAT(recv(bind_.get(), received + i * psize, 3 * psize, 0),
+ SyscallSucceedsWithValue(psize));
+ }
+ EXPECT_EQ(memcmp(buf, received, 3 * psize), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_WritevReadv) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Direct writes from sock to bind_.
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ // Send 2 packets from sock to bind_, where each packet's data consists of
+ // 2 discontiguous iovecs.
+ constexpr size_t kPieceSize = 100;
+ char buf[4 * kPieceSize];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ for (int i = 0; i < 2; i++) {
+ struct iovec iov[2];
+ for (int j = 0; j < 2; j++) {
+ iov[j].iov_base = reinterpret_cast<void*>(
+ reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
+ iov[j].iov_len = kPieceSize;
+ }
+ ASSERT_THAT(writev(sock_.get(), iov, 2),
+ SyscallSucceedsWithValue(2 * kPieceSize));
+ }
+
+ // Receive the data as 2 separate packets.
+ char received[6 * kPieceSize];
+ for (int i = 0; i < 2; i++) {
+ struct iovec iov[3];
+ for (int j = 0; j < 3; j++) {
+ iov[j].iov_base = reinterpret_cast<void*>(
+ reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
+ iov[j].iov_len = kPieceSize;
+ }
+ ASSERT_THAT(readv(bind_.get(), iov, 3),
+ SyscallSucceedsWithValue(2 * kPieceSize));
+ }
+ EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
+}
+
+TEST_P(UdpSocketTest, BoundaryPreserved_SendMsgRecvMsg) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Send 2 packets from sock to bind_, where each packet's data consists of
+ // 2 discontiguous iovecs.
+ constexpr size_t kPieceSize = 100;
+ char buf[4 * kPieceSize];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ for (int i = 0; i < 2; i++) {
+ struct iovec iov[2];
+ for (int j = 0; j < 2; j++) {
+ iov[j].iov_base = reinterpret_cast<void*>(
+ reinterpret_cast<uintptr_t>(buf) + (i + 2 * j) * kPieceSize);
+ iov[j].iov_len = kPieceSize;
+ }
+ struct msghdr msg = {};
+ msg.msg_name = bind_addr_;
+ msg.msg_namelen = addrlen_;
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 2;
+ ASSERT_THAT(sendmsg(sock_.get(), &msg, 0),
+ SyscallSucceedsWithValue(2 * kPieceSize));
+ }
+
+ // Receive the data as 2 separate packets.
+ char received[6 * kPieceSize];
+ for (int i = 0; i < 2; i++) {
+ struct iovec iov[3];
+ for (int j = 0; j < 3; j++) {
+ iov[j].iov_base = reinterpret_cast<void*>(
+ reinterpret_cast<uintptr_t>(received) + (i + 2 * j) * kPieceSize);
+ iov[j].iov_len = kPieceSize;
+ }
+ struct msghdr msg = {};
+ msg.msg_iov = iov;
+ msg.msg_iovlen = 3;
+ ASSERT_THAT(recvmsg(bind_.get(), &msg, 0),
+ SyscallSucceedsWithValue(2 * kPieceSize));
+ }
+ EXPECT_EQ(memcmp(buf, received, 4 * kPieceSize), 0);
+}
+
+TEST_P(UdpSocketTest, FIONREADShutdown) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ int n = -1;
+ EXPECT_THAT(ioctl(sock_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ // A UDP socket must be connected before it can be shutdown.
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ n = -1;
+ EXPECT_THAT(ioctl(sock_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ EXPECT_THAT(shutdown(sock_.get(), SHUT_RD), SyscallSucceeds());
+
+ n = -1;
+ EXPECT_THAT(ioctl(sock_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+}
+
+TEST_P(UdpSocketTest, FIONREADWriteShutdown) {
+ int n = -1;
+ EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // A UDP socket must be connected before it can be shutdown.
+ ASSERT_THAT(connect(bind_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ n = -1;
+ EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ const char str[] = "abc";
+ ASSERT_THAT(send(bind_.get(), str, sizeof(str), 0),
+ SyscallSucceedsWithValue(sizeof(str)));
+
+ struct pollfd pfd = {bind_.get(), POLLIN, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+ SyscallSucceedsWithValue(1));
+
+ n = -1;
+ EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, sizeof(str));
+
+ EXPECT_THAT(shutdown(bind_.get(), SHUT_RD), SyscallSucceeds());
+
+ n = -1;
+ EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, sizeof(str));
+}
+
+// NOTE: Do not use `FIONREAD` as test name because it will be replaced by the
+// corresponding macro and become `0x541B`.
+TEST_P(UdpSocketTest, Fionread) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Check that the bound socket with an empty buffer reports an empty first
+ // packet.
+ int n = -1;
+ EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ // Send 3 packets from sock to bind_.
+ constexpr int psize = 100;
+ char buf[3 * psize];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ struct pollfd pfd = {bind_.get(), POLLIN, 0};
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_THAT(
+ sendto(sock_.get(), buf + i * psize, psize, 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(psize));
+
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+ SyscallSucceedsWithValue(1));
+
+ // Check that regardless of how many packets are in the queue, the size
+ // reported is that of a single packet.
+ n = -1;
+ EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, psize);
+ }
+}
+
+TEST_P(UdpSocketTest, FIONREADZeroLengthPacket) {
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // Check that the bound socket with an empty buffer reports an empty first
+ // packet.
+ int n = -1;
+ EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ // Send 3 packets from sock to bind_.
+ constexpr int psize = 100;
+ char buf[3 * psize];
+ RandomizeBuffer(buf, sizeof(buf));
+
+ struct pollfd pfd = {bind_.get(), POLLIN, 0};
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_THAT(
+ sendto(sock_.get(), buf + i * psize, 0, 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(0));
+
+ // TODO(gvisor.dev/issue/2726): sending a zero-length message to a hostinet
+ // socket does not cause a poll event to be triggered.
+ if (!IsRunningWithHostinet()) {
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+ SyscallSucceedsWithValue(1));
+ }
+
+ // Check that regardless of how many packets are in the queue, the size
+ // reported is that of a single packet.
+ n = -1;
+ EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+ }
+}
+
+TEST_P(UdpSocketTest, FIONREADZeroLengthWriteShutdown) {
+ int n = -1;
+ EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ // A UDP socket must be connected before it can be shutdown.
+ ASSERT_THAT(connect(bind_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ n = -1;
+ EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ const char str[] = "abc";
+ ASSERT_THAT(send(bind_.get(), str, 0, 0), SyscallSucceedsWithValue(0));
+
+ n = -1;
+ EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+
+ EXPECT_THAT(shutdown(bind_.get(), SHUT_RD), SyscallSucceeds());
+
+ n = -1;
+ EXPECT_THAT(ioctl(bind_.get(), FIONREAD, &n), SyscallSucceedsWithValue(0));
+ EXPECT_EQ(n, 0);
+}
+
+TEST_P(UdpSocketTest, SoNoCheckOffByDefault) {
+ // TODO(gvisor.dev/issue/1202): SO_NO_CHECK socket option not supported by
+ // hostinet.
+ SKIP_IF(IsRunningWithHostinet());
+
+ int v = -1;
+ socklen_t optlen = sizeof(v);
+ ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, &optlen),
+ SyscallSucceeds());
+ ASSERT_EQ(v, kSockOptOff);
+ ASSERT_EQ(optlen, sizeof(v));
+}
+
+TEST_P(UdpSocketTest, SoNoCheck) {
+ // TODO(gvisor.dev/issue/1202): SO_NO_CHECK socket option not supported by
+ // hostinet.
+ SKIP_IF(IsRunningWithHostinet());
+
+ int v = kSockOptOn;
+ socklen_t optlen = sizeof(v);
+ ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, optlen),
+ SyscallSucceeds());
+ v = -1;
+ ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, &optlen),
+ SyscallSucceeds());
+ ASSERT_EQ(v, kSockOptOn);
+ ASSERT_EQ(optlen, sizeof(v));
+
+ v = kSockOptOff;
+ ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, optlen),
+ SyscallSucceeds());
+ v = -1;
+ ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_NO_CHECK, &v, &optlen),
+ SyscallSucceeds());
+ ASSERT_EQ(v, kSockOptOff);
+ ASSERT_EQ(optlen, sizeof(v));
+}
+
+TEST_P(UdpSocketTest, SoTimestampOffByDefault) {
+ // TODO(gvisor.dev/issue/1202): SO_TIMESTAMP socket option not supported by
+ // hostinet.
+ SKIP_IF(IsRunningWithHostinet());
+
+ int v = -1;
+ socklen_t optlen = sizeof(v);
+ ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_TIMESTAMP, &v, &optlen),
+ SyscallSucceeds());
+ ASSERT_EQ(v, kSockOptOff);
+ ASSERT_EQ(optlen, sizeof(v));
+}
+
+TEST_P(UdpSocketTest, SoTimestamp) {
+ // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
+ // supported by hostinet.
+ SKIP_IF(IsRunningWithHostinet());
+
+ ASSERT_NO_ERRNO(BindLoopback());
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ int v = 1;
+ ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
+ SyscallSucceeds());
+
+ char buf[3];
+ // Send zero length packet from sock to bind_.
+ ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, 0),
+ SyscallSucceedsWithValue(0));
+
+ struct pollfd pfd = {bind_.get(), POLLIN, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+ SyscallSucceedsWithValue(1));
+
+ char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
+ msghdr msg;
+ memset(&msg, 0, sizeof(msg));
+ iovec iov;
+ memset(&iov, 0, sizeof(iov));
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = sizeof(cmsgbuf);
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(bind_.get(), &msg, 0),
+ SyscallSucceedsWithValue(0));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SO_TIMESTAMP);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct timeval)));
+
+ struct timeval tv = {};
+ memcpy(&tv, CMSG_DATA(cmsg), sizeof(struct timeval));
+
+ ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+
+ // There should be nothing to get via ioctl.
+ ASSERT_THAT(ioctl(bind_.get(), SIOCGSTAMP, &tv),
+ SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_P(UdpSocketTest, WriteShutdownNotConnected) {
+ EXPECT_THAT(shutdown(bind_.get(), SHUT_WR), SyscallFailsWithErrno(ENOTCONN));
+}
+
+TEST_P(UdpSocketTest, TimestampIoctl) {
+ // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+ SKIP_IF(IsRunningWithHostinet());
+
+ ASSERT_NO_ERRNO(BindLoopback());
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ char buf[3];
+ // Send packet from sock to bind_.
+ ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+
+ struct pollfd pfd = {bind_.get(), POLLIN, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+ SyscallSucceedsWithValue(1));
+
+ // There should be no control messages.
+ char recv_buf[sizeof(buf)];
+ ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(bind_.get(), recv_buf, sizeof(recv_buf)));
+
+ // A nonzero timeval should be available via ioctl.
+ struct timeval tv = {};
+ ASSERT_THAT(ioctl(bind_.get(), SIOCGSTAMP, &tv), SyscallSucceeds());
+ ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+}
+
+TEST_P(UdpSocketTest, TimestampIoctlNothingRead) {
+ // TODO(gvisor.dev/issue/1202): ioctl() is not supported by hostinet.
+ SKIP_IF(IsRunningWithHostinet());
+
+ ASSERT_NO_ERRNO(BindLoopback());
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ struct timeval tv = {};
+ ASSERT_THAT(ioctl(sock_.get(), SIOCGSTAMP, &tv),
+ SyscallFailsWithErrno(ENOENT));
+}
+
+// Test that the timestamp accessed via SIOCGSTAMP is still accessible after
+// SO_TIMESTAMP is enabled and used to retrieve a timestamp.
+TEST_P(UdpSocketTest, TimestampIoctlPersistence) {
+ // TODO(gvisor.dev/issue/1202): ioctl() and SO_TIMESTAMP socket option are not
+ // supported by hostinet.
+ SKIP_IF(IsRunningWithHostinet());
+
+ ASSERT_NO_ERRNO(BindLoopback());
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ char buf[3];
+ // Send packet from sock to bind_.
+ ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, 0),
+ SyscallSucceedsWithValue(0));
+
+ struct pollfd pfd = {bind_.get(), POLLIN, 0};
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+ SyscallSucceedsWithValue(1));
+
+ // There should be no control messages.
+ char recv_buf[sizeof(buf)];
+ ASSERT_NO_FATAL_FAILURE(RecvNoCmsg(bind_.get(), recv_buf, sizeof(recv_buf)));
+
+ // A nonzero timeval should be available via ioctl.
+ struct timeval tv = {};
+ ASSERT_THAT(ioctl(bind_.get(), SIOCGSTAMP, &tv), SyscallSucceeds());
+ ASSERT_TRUE(tv.tv_sec != 0 || tv.tv_usec != 0);
+
+ // Enable SO_TIMESTAMP and send a message.
+ int v = 1;
+ EXPECT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_TIMESTAMP, &v, sizeof(v)),
+ SyscallSucceeds());
+ ASSERT_THAT(RetryEINTR(write)(sock_.get(), buf, 0),
+ SyscallSucceedsWithValue(0));
+
+ ASSERT_THAT(RetryEINTR(poll)(&pfd, 1, /*timeout=*/1000),
+ SyscallSucceedsWithValue(1));
+
+ // There should be a message for SO_TIMESTAMP.
+ char cmsgbuf[CMSG_SPACE(sizeof(struct timeval))];
+ msghdr msg = {};
+ iovec iov = {};
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = sizeof(cmsgbuf);
+ ASSERT_THAT(RetryEINTR(recvmsg)(bind_.get(), &msg, 0),
+ SyscallSucceedsWithValue(0));
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+
+ // The ioctl should return the exact same values as before.
+ struct timeval tv2 = {};
+ ASSERT_THAT(ioctl(bind_.get(), SIOCGSTAMP, &tv2), SyscallSucceeds());
+ ASSERT_EQ(tv.tv_sec, tv2.tv_sec);
+ ASSERT_EQ(tv.tv_usec, tv2.tv_usec);
+}
+
+// Test that a socket with IP_TOS or IPV6_TCLASS set will set the TOS byte on
+// outgoing packets, and that a receiving socket with IP_RECVTOS or
+// IPV6_RECVTCLASS will create the corresponding control message.
+TEST_P(UdpSocketTest, SetAndReceiveTOS) {
+ ASSERT_NO_ERRNO(BindLoopback());
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ // Allow socket to receive control message.
+ int recv_level = SOL_IP;
+ int recv_type = IP_RECVTOS;
+ if (GetParam() != AddressFamily::kIpv4) {
+ recv_level = SOL_IPV6;
+ recv_type = IPV6_RECVTCLASS;
+ }
+ ASSERT_THAT(setsockopt(bind_.get(), recv_level, recv_type, &kSockOptOn,
+ sizeof(kSockOptOn)),
+ SyscallSucceeds());
+
+ // Set socket TOS.
+ int sent_level = recv_level;
+ int sent_type = IP_TOS;
+ if (sent_level == SOL_IPV6) {
+ sent_type = IPV6_TCLASS;
+ }
+ int sent_tos = IPTOS_LOWDELAY; // Choose some TOS value.
+ ASSERT_THAT(setsockopt(sock_.get(), sent_level, sent_type, &sent_tos,
+ sizeof(sent_tos)),
+ SyscallSucceeds());
+
+ // Prepare message to send.
+ constexpr size_t kDataLength = 1024;
+ struct msghdr sent_msg = {};
+ struct iovec sent_iov = {};
+ char sent_data[kDataLength];
+ sent_iov.iov_base = &sent_data[0];
+ sent_iov.iov_len = kDataLength;
+ sent_msg.msg_iov = &sent_iov;
+ sent_msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(sock_.get(), &sent_msg, 0),
+ SyscallSucceedsWithValue(kDataLength));
+
+ // Receive message.
+ struct msghdr received_msg = {};
+ struct iovec received_iov = {};
+ char received_data[kDataLength];
+ received_iov.iov_base = &received_data[0];
+ received_iov.iov_len = kDataLength;
+ received_msg.msg_iov = &received_iov;
+ received_msg.msg_iovlen = 1;
+ size_t cmsg_data_len = sizeof(int8_t);
+ if (sent_type == IPV6_TCLASS) {
+ cmsg_data_len = sizeof(int);
+ }
+ std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+ received_msg.msg_control = &received_cmsgbuf[0];
+ received_msg.msg_controllen = received_cmsgbuf.size();
+ ASSERT_THAT(RetryEINTR(recvmsg)(bind_.get(), &received_msg, 0),
+ SyscallSucceedsWithValue(kDataLength));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+ EXPECT_EQ(cmsg->cmsg_level, sent_level);
+ EXPECT_EQ(cmsg->cmsg_type, sent_type);
+ int8_t received_tos = 0;
+ memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
+ EXPECT_EQ(received_tos, sent_tos);
+}
+
+// Test that sendmsg with IP_TOS and IPV6_TCLASS control messages will set the
+// TOS byte on outgoing packets, and that a receiving socket with IP_RECVTOS or
+// IPV6_RECVTCLASS will create the corresponding control message.
+TEST_P(UdpSocketTest, SendAndReceiveTOS) {
+ // TODO(b/146661005): Setting TOS via cmsg not supported for netstack.
+ SKIP_IF(IsRunningOnGvisor() && !IsRunningWithHostinet());
+
+ ASSERT_NO_ERRNO(BindLoopback());
+ ASSERT_THAT(connect(sock_.get(), bind_addr_, addrlen_), SyscallSucceeds());
+
+ // Allow socket to receive control message.
+ int recv_level = SOL_IP;
+ int recv_type = IP_RECVTOS;
+ if (GetParam() != AddressFamily::kIpv4) {
+ recv_level = SOL_IPV6;
+ recv_type = IPV6_RECVTCLASS;
+ }
+ int recv_opt = kSockOptOn;
+ ASSERT_THAT(setsockopt(bind_.get(), recv_level, recv_type, &recv_opt,
+ sizeof(recv_opt)),
+ SyscallSucceeds());
+
+ // Prepare message to send.
+ constexpr size_t kDataLength = 1024;
+ int sent_level = recv_level;
+ int sent_type = IP_TOS;
+ int sent_tos = IPTOS_LOWDELAY; // Choose some TOS value.
+
+ struct msghdr sent_msg = {};
+ struct iovec sent_iov = {};
+ char sent_data[kDataLength];
+ sent_iov.iov_base = &sent_data[0];
+ sent_iov.iov_len = kDataLength;
+ sent_msg.msg_iov = &sent_iov;
+ sent_msg.msg_iovlen = 1;
+ size_t cmsg_data_len = sizeof(int8_t);
+ if (sent_level == SOL_IPV6) {
+ sent_type = IPV6_TCLASS;
+ cmsg_data_len = sizeof(int);
+ }
+ std::vector<char> sent_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+ sent_msg.msg_control = &sent_cmsgbuf[0];
+ sent_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+
+ // Manually add control message.
+ struct cmsghdr* sent_cmsg = CMSG_FIRSTHDR(&sent_msg);
+ sent_cmsg->cmsg_len = CMSG_LEN(cmsg_data_len);
+ sent_cmsg->cmsg_level = sent_level;
+ sent_cmsg->cmsg_type = sent_type;
+ *(int8_t*)CMSG_DATA(sent_cmsg) = sent_tos;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(sock_.get(), &sent_msg, 0),
+ SyscallSucceedsWithValue(kDataLength));
+
+ // Receive message.
+ struct msghdr received_msg = {};
+ struct iovec received_iov = {};
+ char received_data[kDataLength];
+ received_iov.iov_base = &received_data[0];
+ received_iov.iov_len = kDataLength;
+ received_msg.msg_iov = &received_iov;
+ received_msg.msg_iovlen = 1;
+ std::vector<char> received_cmsgbuf(CMSG_SPACE(cmsg_data_len));
+ received_msg.msg_control = &received_cmsgbuf[0];
+ received_msg.msg_controllen = CMSG_LEN(cmsg_data_len);
+ ASSERT_THAT(RetryEINTR(recvmsg)(bind_.get(), &received_msg, 0),
+ SyscallSucceedsWithValue(kDataLength));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&received_msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(cmsg_data_len));
+ EXPECT_EQ(cmsg->cmsg_level, sent_level);
+ EXPECT_EQ(cmsg->cmsg_type, sent_type);
+ int8_t received_tos = 0;
+ memcpy(&received_tos, CMSG_DATA(cmsg), sizeof(received_tos));
+ EXPECT_EQ(received_tos, sent_tos);
+}
+
+TEST_P(UdpSocketTest, RecvBufLimitsEmptyRcvBuf) {
+ // Discover minimum buffer size by setting it to zero.
+ constexpr int kRcvBufSz = 0;
+ ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &kRcvBufSz,
+ sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ int min = 0;
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+ SyscallSucceeds());
+
+ // Bind bind_ to loopback.
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ {
+ // Send data of size min and verify that it's received.
+ std::vector<char> buf(min);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(
+ sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(buf.size()));
+ std::vector<char> received(buf.size());
+ EXPECT_THAT(
+ recv(bind_.get(), received.data(), received.size(), MSG_DONTWAIT),
+ SyscallSucceedsWithValue(received.size()));
+ }
+
+ {
+ // Send data of size min + 1 and verify that its received. Both linux and
+ // Netstack accept a dgram that exceeds rcvBuf limits if the receive buffer
+ // is currently empty.
+ std::vector<char> buf(min + 1);
+ RandomizeBuffer(buf.data(), buf.size());
+ ASSERT_THAT(
+ sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(buf.size()));
+
+ std::vector<char> received(buf.size());
+ EXPECT_THAT(
+ recv(bind_.get(), received.data(), received.size(), MSG_DONTWAIT),
+ SyscallSucceedsWithValue(received.size()));
+ }
+}
+
+// Test that receive buffer limits are enforced.
+TEST_P(UdpSocketTest, RecvBufLimits) {
+ // Bind s_ to loopback.
+ ASSERT_NO_ERRNO(BindLoopback());
+
+ int min = 0;
+ {
+ // Discover minimum buffer size by trying to set it to zero.
+ constexpr int kRcvBufSz = 0;
+ ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &kRcvBufSz,
+ sizeof(kRcvBufSz)),
+ SyscallSucceeds());
+
+ socklen_t min_len = sizeof(min);
+ ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &min, &min_len),
+ SyscallSucceeds());
+ }
+
+ // Now set the limit to min * 4.
+ int new_rcv_buf_sz = min * 4;
+ if (!IsRunningOnGvisor() || IsRunningWithHostinet()) {
+ // Linux doubles the value specified so just set to min * 2.
+ new_rcv_buf_sz = min * 2;
+ }
+
+ ASSERT_THAT(setsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &new_rcv_buf_sz,
+ sizeof(new_rcv_buf_sz)),
+ SyscallSucceeds());
+ int rcv_buf_sz = 0;
+ {
+ socklen_t rcv_buf_len = sizeof(rcv_buf_sz);
+ ASSERT_THAT(getsockopt(bind_.get(), SOL_SOCKET, SO_RCVBUF, &rcv_buf_sz,
+ &rcv_buf_len),
+ SyscallSucceeds());
+ }
+
+ {
+ std::vector<char> buf(min);
+ RandomizeBuffer(buf.data(), buf.size());
+
+ ASSERT_THAT(
+ sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(buf.size()));
+ ASSERT_THAT(
+ sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(buf.size()));
+ ASSERT_THAT(
+ sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(buf.size()));
+ ASSERT_THAT(
+ sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(buf.size()));
+ int sent = 4;
+ if (IsRunningOnGvisor() && !IsRunningWithHostinet()) {
+ // Linux seems to drop the 4th packet even though technically it should
+ // fit in the receive buffer.
+ ASSERT_THAT(
+ sendto(sock_.get(), buf.data(), buf.size(), 0, bind_addr_, addrlen_),
+ SyscallSucceedsWithValue(buf.size()));
+ sent++;
+ }
+
+ for (int i = 0; i < sent - 1; i++) {
+ // Receive the data.
+ std::vector<char> received(buf.size());
+ EXPECT_THAT(
+ recv(bind_.get(), received.data(), received.size(), MSG_DONTWAIT),
+ SyscallSucceedsWithValue(received.size()));
+ EXPECT_EQ(memcmp(buf.data(), received.data(), buf.size()), 0);
+ }
+
+ // The last receive should fail with EAGAIN as the last packet should have
+ // been dropped due to lack of space in the receive buffer.
+ std::vector<char> received(buf.size());
+ EXPECT_THAT(
+ recv(bind_.get(), received.data(), received.size(), MSG_DONTWAIT),
+ SyscallFailsWithErrno(EAGAIN));
+ }
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/udp_socket_test_cases.h b/test/syscalls/linux/udp_socket_test_cases.h
new file mode 100644
index 000000000..f7e25c805
--- /dev/null
+++ b/test/syscalls/linux/udp_socket_test_cases.h
@@ -0,0 +1,82 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_
+#define THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_
+
+#include <sys/socket.h>
+
+#include "gtest/gtest.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+
+namespace gvisor {
+namespace testing {
+
+// The initial port to be be used on gvisor.
+constexpr int TestPort = 40000;
+
+// Fixture for tests parameterized by the address family to use (AF_INET and
+// AF_INET6) when creating sockets.
+class UdpSocketTest
+ : public ::testing::TestWithParam<gvisor::testing::AddressFamily> {
+ protected:
+ // Creates two sockets that will be used by test cases.
+ void SetUp() override;
+
+ // Binds the socket bind_ to the loopback and updates bind_addr_.
+ PosixError BindLoopback();
+
+ // Binds the socket bind_ to Any and updates bind_addr_.
+ PosixError BindAny();
+
+ // Binds given socket to address addr and updates.
+ PosixError BindSocket(int socket, struct sockaddr* addr);
+
+ // Return initialized Any address to port 0.
+ struct sockaddr_storage InetAnyAddr();
+
+ // Return initialized Loopback address to port 0.
+ struct sockaddr_storage InetLoopbackAddr();
+
+ // Disconnects socket sockfd.
+ void Disconnect(int sockfd);
+
+ // Get family for the test.
+ int GetFamily();
+
+ // Socket used by Bind methods
+ FileDescriptor bind_;
+
+ // Second socket used for tests.
+ FileDescriptor sock_;
+
+ // Address for bind_ socket.
+ struct sockaddr* bind_addr_;
+
+ // Initialized to the length based on GetFamily().
+ socklen_t addrlen_;
+
+ // Storage for bind_addr_.
+ struct sockaddr_storage bind_addr_storage_;
+
+ private:
+ // Helper to initialize addrlen_ for the test case.
+ socklen_t GetAddrLength();
+};
+} // namespace testing
+} // namespace gvisor
+
+#endif // THIRD_PARTY_GOLANG_GVISOR_TEST_SYSCALLS_LINUX_SOCKET_IPV4_UDP_UNBOUND_H_
diff --git a/test/syscalls/linux/uidgid.cc b/test/syscalls/linux/uidgid.cc
new file mode 100644
index 000000000..64d6d0b8f
--- /dev/null
+++ b/test/syscalls/linux/uidgid.cc
@@ -0,0 +1,276 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <grp.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/flags/flag.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "test/util/capability_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/uid_util.h"
+
+ABSL_FLAG(int32_t, scratch_uid1, 65534, "first scratch UID");
+ABSL_FLAG(int32_t, scratch_uid2, 65533, "second scratch UID");
+ABSL_FLAG(int32_t, scratch_gid1, 65534, "first scratch GID");
+ABSL_FLAG(int32_t, scratch_gid2, 65533, "second scratch GID");
+
+using ::testing::UnorderedElementsAreArray;
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(UidGidTest, Getuid) {
+ uid_t ruid, euid, suid;
+ EXPECT_THAT(getresuid(&ruid, &euid, &suid), SyscallSucceeds());
+ EXPECT_THAT(getuid(), SyscallSucceedsWithValue(ruid));
+ EXPECT_THAT(geteuid(), SyscallSucceedsWithValue(euid));
+}
+
+TEST(UidGidTest, Getgid) {
+ gid_t rgid, egid, sgid;
+ EXPECT_THAT(getresgid(&rgid, &egid, &sgid), SyscallSucceeds());
+ EXPECT_THAT(getgid(), SyscallSucceedsWithValue(rgid));
+ EXPECT_THAT(getegid(), SyscallSucceedsWithValue(egid));
+}
+
+TEST(UidGidTest, Getgroups) {
+ // "If size is zero, list is not modified, but the total number of
+ // supplementary group IDs for the process is returned." - getgroups(2)
+ int nr_groups;
+ ASSERT_THAT(nr_groups = getgroups(0, nullptr), SyscallSucceeds());
+ std::vector<gid_t> list(nr_groups);
+ EXPECT_THAT(getgroups(list.size(), list.data()), SyscallSucceeds());
+
+ // "EINVAL: size is less than the number of supplementary group IDs, but is
+ // not zero."
+ EXPECT_THAT(getgroups(-1, nullptr), SyscallFailsWithErrno(EINVAL));
+
+ // Testing for EFAULT requires actually having groups, which isn't guaranteed
+ // here; see the setgroups test below.
+}
+
+// Checks that the calling process' real/effective/saved user IDs are
+// ruid/euid/suid respectively.
+PosixError CheckUIDs(uid_t ruid, uid_t euid, uid_t suid) {
+ uid_t actual_ruid, actual_euid, actual_suid;
+ int rc = getresuid(&actual_ruid, &actual_euid, &actual_suid);
+ MaybeSave();
+ if (rc < 0) {
+ return PosixError(errno, "getresuid");
+ }
+ if (ruid != actual_ruid || euid != actual_euid || suid != actual_suid) {
+ return PosixError(
+ EPERM, absl::StrCat(
+ "incorrect user IDs: got (",
+ absl::StrJoin({actual_ruid, actual_euid, actual_suid}, ", "),
+ ", wanted (", absl::StrJoin({ruid, euid, suid}, ", "), ")"));
+ }
+ return NoError();
+}
+
+PosixError CheckGIDs(gid_t rgid, gid_t egid, gid_t sgid) {
+ gid_t actual_rgid, actual_egid, actual_sgid;
+ int rc = getresgid(&actual_rgid, &actual_egid, &actual_sgid);
+ MaybeSave();
+ if (rc < 0) {
+ return PosixError(errno, "getresgid");
+ }
+ if (rgid != actual_rgid || egid != actual_egid || sgid != actual_sgid) {
+ return PosixError(
+ EPERM, absl::StrCat(
+ "incorrect group IDs: got (",
+ absl::StrJoin({actual_rgid, actual_egid, actual_sgid}, ", "),
+ ", wanted (", absl::StrJoin({rgid, egid, sgid}, ", "), ")"));
+ }
+ return NoError();
+}
+
+// N.B. These tests may break horribly unless run via a gVisor test runner,
+// because changing UID in one test may forfeit permissions required by other
+// tests. (The test runner runs each test in a separate process.)
+
+TEST(UidGidRootTest, Setuid) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+ // Do setuid in a separate thread so that after finishing this test, the
+ // process can still open files the test harness created before starting this
+ // test. Otherwise, the files are created by root (UID before the test), but
+ // cannot be opened by the `uid` set below after the test. After calling
+ // setuid(non-zero-UID), there is no way to get root privileges back.
+ ScopedThread([&] {
+ // Use syscall instead of glibc setuid wrapper because we want this setuid
+ // call to only apply to this task. POSIX threads, however, require that all
+ // threads have the same UIDs, so using the setuid wrapper sets all threads'
+ // real UID.
+ EXPECT_THAT(syscall(SYS_setuid, -1), SyscallFailsWithErrno(EINVAL));
+
+ const uid_t uid = absl::GetFlag(FLAGS_scratch_uid1);
+ EXPECT_THAT(syscall(SYS_setuid, uid), SyscallSucceeds());
+ // "If the effective UID of the caller is root (more precisely: if the
+ // caller has the CAP_SETUID capability), the real UID and saved set-user-ID
+ // are also set." - setuid(2)
+ EXPECT_NO_ERRNO(CheckUIDs(uid, uid, uid));
+ });
+}
+
+TEST(UidGidRootTest, Setgid) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+ EXPECT_THAT(setgid(-1), SyscallFailsWithErrno(EINVAL));
+
+ const gid_t gid = absl::GetFlag(FLAGS_scratch_gid1);
+ ASSERT_THAT(setgid(gid), SyscallSucceeds());
+ EXPECT_NO_ERRNO(CheckGIDs(gid, gid, gid));
+}
+
+TEST(UidGidRootTest, SetgidNotFromThreadGroupLeader) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+ const gid_t gid = absl::GetFlag(FLAGS_scratch_gid1);
+ // NOTE(b/64676707): Do setgid in a separate thread so that we can test if
+ // info.si_pid is set correctly.
+ ScopedThread([gid] { ASSERT_THAT(setgid(gid), SyscallSucceeds()); });
+ EXPECT_NO_ERRNO(CheckGIDs(gid, gid, gid));
+}
+
+TEST(UidGidRootTest, Setreuid) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+ // "Supplying a value of -1 for either the real or effective user ID forces
+ // the system to leave that ID unchanged." - setreuid(2)
+ EXPECT_THAT(setreuid(-1, -1), SyscallSucceeds());
+ EXPECT_NO_ERRNO(CheckUIDs(0, 0, 0));
+
+ // Do setuid in a separate thread so that after finishing this test, the
+ // process can still open files the test harness created before starting this
+ // test. Otherwise, the files are created by root (UID before the test), but
+ // cannot be opened by the `uid` set below after the test. After calling
+ // setuid(non-zero-UID), there is no way to get root privileges back.
+ ScopedThread([&] {
+ const uid_t ruid = absl::GetFlag(FLAGS_scratch_uid1);
+ const uid_t euid = absl::GetFlag(FLAGS_scratch_uid2);
+
+ // Use syscall instead of glibc setuid wrapper because we want this setuid
+ // call to only apply to this task. posix threads, however, require that all
+ // threads have the same UIDs, so using the setuid wrapper sets all threads'
+ // real UID.
+ EXPECT_THAT(syscall(SYS_setreuid, ruid, euid), SyscallSucceeds());
+
+ // "If the real user ID is set or the effective user ID is set to a value
+ // not equal to the previous real user ID, the saved set-user-ID will be set
+ // to the new effective user ID." - setreuid(2)
+ EXPECT_NO_ERRNO(CheckUIDs(ruid, euid, euid));
+ });
+}
+
+TEST(UidGidRootTest, Setregid) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+ EXPECT_THAT(setregid(-1, -1), SyscallSucceeds());
+ EXPECT_NO_ERRNO(CheckGIDs(0, 0, 0));
+
+ const gid_t rgid = absl::GetFlag(FLAGS_scratch_gid1);
+ const gid_t egid = absl::GetFlag(FLAGS_scratch_gid2);
+ ASSERT_THAT(setregid(rgid, egid), SyscallSucceeds());
+ EXPECT_NO_ERRNO(CheckGIDs(rgid, egid, egid));
+}
+
+TEST(UidGidRootTest, Setresuid) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+ // "If one of the arguments equals -1, the corresponding value is not
+ // changed." - setresuid(2)
+ EXPECT_THAT(setresuid(-1, -1, -1), SyscallSucceeds());
+ EXPECT_NO_ERRNO(CheckUIDs(0, 0, 0));
+
+ // Do setuid in a separate thread so that after finishing this test, the
+ // process can still open files the test harness created before starting this
+ // test. Otherwise, the files are created by root (UID before the test), but
+ // cannot be opened by the `uid` set below after the test. After calling
+ // setuid(non-zero-UID), there is no way to get root privileges back.
+ ScopedThread([&] {
+ const uid_t ruid = 12345;
+ const uid_t euid = 23456;
+ const uid_t suid = 34567;
+
+ // Use syscall instead of glibc setuid wrapper because we want this setuid
+ // call to only apply to this task. posix threads, however, require that all
+ // threads have the same UIDs, so using the setuid wrapper sets all threads'
+ // real UID.
+ EXPECT_THAT(syscall(SYS_setresuid, ruid, euid, suid), SyscallSucceeds());
+ EXPECT_NO_ERRNO(CheckUIDs(ruid, euid, suid));
+ });
+}
+
+TEST(UidGidRootTest, Setresgid) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+ EXPECT_THAT(setresgid(-1, -1, -1), SyscallSucceeds());
+ EXPECT_NO_ERRNO(CheckGIDs(0, 0, 0));
+
+ const gid_t rgid = 12345;
+ const gid_t egid = 23456;
+ const gid_t sgid = 34567;
+ ASSERT_THAT(setresgid(rgid, egid, sgid), SyscallSucceeds());
+ EXPECT_NO_ERRNO(CheckGIDs(rgid, egid, sgid));
+}
+
+TEST(UidGidRootTest, Setgroups) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+ std::vector<gid_t> list = {123, 500};
+ ASSERT_THAT(setgroups(list.size(), list.data()), SyscallSucceeds());
+ std::vector<gid_t> list2(list.size());
+ ASSERT_THAT(getgroups(list2.size(), list2.data()), SyscallSucceeds());
+ EXPECT_THAT(list, UnorderedElementsAreArray(list2));
+
+ // "EFAULT: list has an invalid address."
+ EXPECT_THAT(getgroups(100, reinterpret_cast<gid_t*>(-1)),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST(UidGidRootTest, Setuid_prlimit) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(IsRoot()));
+
+ // Do seteuid in a separate thread so that after finishing this test, the
+ // process can still open files the test harness created before starting this
+ // test. Otherwise, the files are created by root (UID before the test), but
+ // cannot be opened by the `uid` set below after the test.
+ ScopedThread([&] {
+ // Use syscall instead of glibc setuid wrapper because we want this seteuid
+ // call to only apply to this task. POSIX threads, however, require that all
+ // threads have the same UIDs, so using the seteuid wrapper sets all
+ // threads' UID.
+ EXPECT_THAT(syscall(SYS_setreuid, -1, 65534), SyscallSucceeds());
+
+ // Despite the UID change, we should be able to get our own limits.
+ struct rlimit rl = {};
+ EXPECT_THAT(prlimit(0, RLIMIT_NOFILE, NULL, &rl), SyscallSucceeds());
+ });
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/uname.cc b/test/syscalls/linux/uname.cc
new file mode 100644
index 000000000..d8824b171
--- /dev/null
+++ b/test/syscalls/linux/uname.cc
@@ -0,0 +1,111 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sched.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "test/util/capability_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(UnameTest, Sanity) {
+ struct utsname buf;
+ ASSERT_THAT(uname(&buf), SyscallSucceeds());
+ EXPECT_NE(strlen(buf.release), 0);
+ EXPECT_NE(strlen(buf.version), 0);
+ EXPECT_NE(strlen(buf.machine), 0);
+ EXPECT_NE(strlen(buf.sysname), 0);
+ EXPECT_NE(strlen(buf.nodename), 0);
+ EXPECT_NE(strlen(buf.domainname), 0);
+}
+
+TEST(UnameTest, SetNames) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ char hostname[65];
+ ASSERT_THAT(sethostname("0123456789", 3), SyscallSucceeds());
+ EXPECT_THAT(gethostname(hostname, sizeof(hostname)), SyscallSucceeds());
+ EXPECT_EQ(absl::string_view(hostname), "012");
+
+ ASSERT_THAT(sethostname("0123456789\0xxx", 11), SyscallSucceeds());
+ EXPECT_THAT(gethostname(hostname, sizeof(hostname)), SyscallSucceeds());
+ EXPECT_EQ(absl::string_view(hostname), "0123456789");
+
+ ASSERT_THAT(sethostname("0123456789\0xxx", 12), SyscallSucceeds());
+ EXPECT_THAT(gethostname(hostname, sizeof(hostname)), SyscallSucceeds());
+ EXPECT_EQ(absl::string_view(hostname), "0123456789");
+
+ constexpr char kHostname[] = "wubbalubba";
+ ASSERT_THAT(sethostname(kHostname, sizeof(kHostname)), SyscallSucceeds());
+
+ constexpr char kDomainname[] = "dubdub.com";
+ ASSERT_THAT(setdomainname(kDomainname, sizeof(kDomainname)),
+ SyscallSucceeds());
+
+ struct utsname buf;
+ EXPECT_THAT(uname(&buf), SyscallSucceeds());
+ EXPECT_EQ(absl::string_view(buf.nodename), kHostname);
+ EXPECT_EQ(absl::string_view(buf.domainname), kDomainname);
+
+ // These should just be glibc wrappers that also call uname(2).
+ EXPECT_THAT(gethostname(hostname, sizeof(hostname)), SyscallSucceeds());
+ EXPECT_EQ(absl::string_view(hostname), kHostname);
+
+ char domainname[65];
+ EXPECT_THAT(getdomainname(domainname, sizeof(domainname)), SyscallSucceeds());
+ EXPECT_EQ(absl::string_view(domainname), kDomainname);
+}
+
+TEST(UnameTest, UnprivilegedSetNames) {
+ if (ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))) {
+ EXPECT_NO_ERRNO(SetCapability(CAP_SYS_ADMIN, false));
+ }
+
+ EXPECT_THAT(sethostname("", 0), SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(setdomainname("", 0), SyscallFailsWithErrno(EPERM));
+}
+
+TEST(UnameTest, UnshareUTS) {
+ SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ struct utsname init;
+ ASSERT_THAT(uname(&init), SyscallSucceeds());
+
+ ScopedThread([&]() {
+ EXPECT_THAT(unshare(CLONE_NEWUTS), SyscallSucceeds());
+
+ constexpr char kHostname[] = "wubbalubba";
+ EXPECT_THAT(sethostname(kHostname, sizeof(kHostname)), SyscallSucceeds());
+
+ char hostname[65];
+ EXPECT_THAT(gethostname(hostname, sizeof(hostname)), SyscallSucceeds());
+ });
+
+ struct utsname after;
+ EXPECT_THAT(uname(&after), SyscallSucceeds());
+ EXPECT_EQ(absl::string_view(after.nodename), init.nodename);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/unix_domain_socket_test_util.cc b/test/syscalls/linux/unix_domain_socket_test_util.cc
new file mode 100644
index 000000000..b05ab2900
--- /dev/null
+++ b/test/syscalls/linux/unix_domain_socket_test_util.cc
@@ -0,0 +1,351 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+
+#include <sys/un.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+std::string DescribeUnixDomainSocketType(int type) {
+ const char* type_str = nullptr;
+ switch (type & ~(SOCK_NONBLOCK | SOCK_CLOEXEC)) {
+ case SOCK_STREAM:
+ type_str = "SOCK_STREAM";
+ break;
+ case SOCK_DGRAM:
+ type_str = "SOCK_DGRAM";
+ break;
+ case SOCK_SEQPACKET:
+ type_str = "SOCK_SEQPACKET";
+ break;
+ }
+ if (!type_str) {
+ return absl::StrCat("Unix domain socket with unknown type ", type);
+ } else {
+ return absl::StrCat(((type & SOCK_NONBLOCK) != 0) ? "non-blocking " : "",
+ ((type & SOCK_CLOEXEC) != 0) ? "close-on-exec " : "",
+ type_str, " Unix domain socket");
+ }
+}
+
+SocketPairKind UnixDomainSocketPair(int type) {
+ return SocketPairKind{DescribeUnixDomainSocketType(type), AF_UNIX, type, 0,
+ SyscallSocketPairCreator(AF_UNIX, type, 0)};
+}
+
+SocketPairKind FilesystemBoundUnixDomainSocketPair(int type) {
+ std::string description = absl::StrCat(DescribeUnixDomainSocketType(type),
+ " created with filesystem binding");
+ if ((type & SOCK_DGRAM) == SOCK_DGRAM) {
+ return SocketPairKind{
+ description, AF_UNIX, type, 0,
+ FilesystemBidirectionalBindSocketPairCreator(AF_UNIX, type, 0)};
+ }
+ return SocketPairKind{
+ description, AF_UNIX, type, 0,
+ FilesystemAcceptBindSocketPairCreator(AF_UNIX, type, 0)};
+}
+
+SocketPairKind AbstractBoundUnixDomainSocketPair(int type) {
+ std::string description =
+ absl::StrCat(DescribeUnixDomainSocketType(type),
+ " created with abstract namespace binding");
+ if ((type & SOCK_DGRAM) == SOCK_DGRAM) {
+ return SocketPairKind{
+ description, AF_UNIX, type, 0,
+ AbstractBidirectionalBindSocketPairCreator(AF_UNIX, type, 0)};
+ }
+ return SocketPairKind{description, AF_UNIX, type, 0,
+ AbstractAcceptBindSocketPairCreator(AF_UNIX, type, 0)};
+}
+
+SocketPairKind SocketpairGoferUnixDomainSocketPair(int type) {
+ std::string description = absl::StrCat(DescribeUnixDomainSocketType(type),
+ " created with the socketpair gofer");
+ return SocketPairKind{description, AF_UNIX, type, 0,
+ SocketpairGoferSocketPairCreator(AF_UNIX, type, 0)};
+}
+
+SocketPairKind SocketpairGoferFileSocketPair(int type) {
+ std::string description =
+ absl::StrCat(((type & O_NONBLOCK) != 0) ? "non-blocking " : "",
+ ((type & O_CLOEXEC) != 0) ? "close-on-exec " : "",
+ "file socket created with the socketpair gofer");
+ // The socketpair gofer always creates SOCK_STREAM sockets on open(2).
+ return SocketPairKind{description, AF_UNIX, SOCK_STREAM, 0,
+ SocketpairGoferFileSocketPairCreator(type)};
+}
+
+SocketPairKind FilesystemUnboundUnixDomainSocketPair(int type) {
+ return SocketPairKind{absl::StrCat(DescribeUnixDomainSocketType(type),
+ " unbound with a filesystem address"),
+ AF_UNIX, type, 0,
+ FilesystemUnboundSocketPairCreator(AF_UNIX, type, 0)};
+}
+
+SocketPairKind AbstractUnboundUnixDomainSocketPair(int type) {
+ return SocketPairKind{
+ absl::StrCat(DescribeUnixDomainSocketType(type),
+ " unbound with an abstract namespace address"),
+ AF_UNIX, type, 0, AbstractUnboundSocketPairCreator(AF_UNIX, type, 0)};
+}
+
+void SendSingleFD(int sock, int fd, char buf[], int buf_size) {
+ ASSERT_NO_FATAL_FAILURE(SendFDs(sock, &fd, 1, buf, buf_size));
+}
+
+void SendFDs(int sock, int fds[], int fds_size, char buf[], int buf_size) {
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(fds_size * sizeof(int)));
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_len = CMSG_LEN(fds_size * sizeof(int));
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ for (int i = 0; i < fds_size; i++) {
+ memcpy(CMSG_DATA(cmsg) + i * sizeof(int), &fds[i], sizeof(int));
+ }
+
+ ASSERT_THAT(SendMsg(sock, &msg, buf, buf_size),
+ IsPosixErrorOkAndHolds(buf_size));
+}
+
+void RecvSingleFD(int sock, int* fd, char buf[], int buf_size) {
+ ASSERT_NO_FATAL_FAILURE(RecvFDs(sock, fd, 1, buf, buf_size, buf_size));
+}
+
+void RecvSingleFD(int sock, int* fd, char buf[], int buf_size,
+ int expected_size) {
+ ASSERT_NO_FATAL_FAILURE(RecvFDs(sock, fd, 1, buf, buf_size, expected_size));
+}
+
+void RecvFDs(int sock, int fds[], int fds_size, char buf[], int buf_size) {
+ ASSERT_NO_FATAL_FAILURE(
+ RecvFDs(sock, fds, fds_size, buf, buf_size, buf_size));
+}
+
+void RecvFDs(int sock, int fds[], int fds_size, char buf[], int buf_size,
+ int expected_size, bool peek) {
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(fds_size * sizeof(int)));
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = buf_size;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ int flags = 0;
+ if (peek) {
+ flags |= MSG_PEEK;
+ }
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sock, &msg, flags),
+ SyscallSucceedsWithValue(expected_size));
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(fds_size * sizeof(int)));
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+ for (int i = 0; i < fds_size; i++) {
+ memcpy(&fds[i], CMSG_DATA(cmsg) + i * sizeof(int), sizeof(int));
+ }
+}
+
+void RecvFDs(int sock, int fds[], int fds_size, char buf[], int buf_size,
+ int expected_size) {
+ ASSERT_NO_FATAL_FAILURE(
+ RecvFDs(sock, fds, fds_size, buf, buf_size, expected_size, false));
+}
+
+void PeekSingleFD(int sock, int* fd, char buf[], int buf_size) {
+ ASSERT_NO_FATAL_FAILURE(RecvFDs(sock, fd, 1, buf, buf_size, buf_size, true));
+}
+
+void RecvNoCmsg(int sock, char buf[], int buf_size, int expected_size) {
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(int)) + CMSG_SPACE(sizeof(struct ucred))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = buf_size;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sock, &msg, 0),
+ SyscallSucceedsWithValue(expected_size));
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ EXPECT_EQ(cmsg, nullptr);
+}
+
+void SendNullCmsg(int sock, char buf[], int buf_size) {
+ struct msghdr msg = {};
+ msg.msg_control = nullptr;
+ msg.msg_controllen = 0;
+
+ ASSERT_THAT(SendMsg(sock, &msg, buf, buf_size),
+ IsPosixErrorOkAndHolds(buf_size));
+}
+
+void SendCreds(int sock, ucred creds, char buf[], int buf_size) {
+ struct msghdr msg = {};
+
+ char control[CMSG_SPACE(sizeof(struct ucred))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_CREDENTIALS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+ memcpy(CMSG_DATA(cmsg), &creds, sizeof(struct ucred));
+
+ ASSERT_THAT(SendMsg(sock, &msg, buf, buf_size),
+ IsPosixErrorOkAndHolds(buf_size));
+}
+
+void SendCredsAndFD(int sock, ucred creds, int fd, char buf[], int buf_size) {
+ struct msghdr msg = {};
+
+ char control[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))] = {};
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct cmsghdr* cmsg1 = CMSG_FIRSTHDR(&msg);
+ cmsg1->cmsg_level = SOL_SOCKET;
+ cmsg1->cmsg_type = SCM_CREDENTIALS;
+ cmsg1->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+ memcpy(CMSG_DATA(cmsg1), &creds, sizeof(struct ucred));
+
+ struct cmsghdr* cmsg2 = CMSG_NXTHDR(&msg, cmsg1);
+ cmsg2->cmsg_level = SOL_SOCKET;
+ cmsg2->cmsg_type = SCM_RIGHTS;
+ cmsg2->cmsg_len = CMSG_LEN(sizeof(int));
+ memcpy(CMSG_DATA(cmsg2), &fd, sizeof(int));
+
+ ASSERT_THAT(SendMsg(sock, &msg, buf, buf_size),
+ IsPosixErrorOkAndHolds(buf_size));
+}
+
+void RecvCreds(int sock, ucred* creds, char buf[], int buf_size) {
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sock, creds, buf, buf_size, buf_size));
+}
+
+void RecvCreds(int sock, ucred* creds, char buf[], int buf_size,
+ int expected_size) {
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(struct ucred))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = buf_size;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sock, &msg, 0),
+ SyscallSucceedsWithValue(expected_size));
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct ucred)));
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+
+ memcpy(creds, CMSG_DATA(cmsg), sizeof(struct ucred));
+}
+
+void RecvCredsAndFD(int sock, ucred* creds, int* fd, char buf[], int buf_size) {
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = buf_size;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sock, &msg, 0),
+ SyscallSucceedsWithValue(buf_size));
+
+ struct cmsghdr* cmsg1 = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg1, nullptr);
+ ASSERT_EQ(cmsg1->cmsg_len, CMSG_LEN(sizeof(struct ucred)));
+ ASSERT_EQ(cmsg1->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg1->cmsg_type, SCM_CREDENTIALS);
+ memcpy(creds, CMSG_DATA(cmsg1), sizeof(struct ucred));
+
+ struct cmsghdr* cmsg2 = CMSG_NXTHDR(&msg, cmsg1);
+ ASSERT_NE(cmsg2, nullptr);
+ ASSERT_EQ(cmsg2->cmsg_len, CMSG_LEN(sizeof(int)));
+ ASSERT_EQ(cmsg2->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg2->cmsg_type, SCM_RIGHTS);
+ memcpy(fd, CMSG_DATA(cmsg2), sizeof(int));
+}
+
+void RecvSingleFDUnaligned(int sock, int* fd, char buf[], int buf_size) {
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(int)) - sizeof(int)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct iovec iov;
+ iov.iov_base = buf;
+ iov.iov_len = buf_size;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sock, &msg, 0),
+ SyscallSucceedsWithValue(buf_size));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+ memcpy(fd, CMSG_DATA(cmsg), sizeof(int));
+}
+
+void SetSoPassCred(int sock) {
+ int one = 1;
+ EXPECT_THAT(setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one)),
+ SyscallSucceeds());
+}
+
+void UnsetSoPassCred(int sock) {
+ int zero = 0;
+ EXPECT_THAT(setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &zero, sizeof(zero)),
+ SyscallSucceeds());
+}
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/unix_domain_socket_test_util.h b/test/syscalls/linux/unix_domain_socket_test_util.h
new file mode 100644
index 000000000..b8073db17
--- /dev/null
+++ b/test/syscalls/linux/unix_domain_socket_test_util.h
@@ -0,0 +1,162 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GVISOR_TEST_SYSCALLS_UNIX_DOMAIN_SOCKET_TEST_UTIL_H_
+#define GVISOR_TEST_SYSCALLS_UNIX_DOMAIN_SOCKET_TEST_UTIL_H_
+
+#include <string>
+
+#include "test/syscalls/linux/socket_test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+// DescribeUnixDomainSocketType returns a human-readable string explaining the
+// given Unix domain socket type.
+std::string DescribeUnixDomainSocketType(int type);
+
+// UnixDomainSocketPair returns a SocketPairKind that represents SocketPairs
+// created by invoking the socketpair() syscall with AF_UNIX and the given type.
+SocketPairKind UnixDomainSocketPair(int type);
+
+// FilesystemBoundUnixDomainSocketPair returns a SocketPairKind that represents
+// SocketPairs created with bind() and accept() syscalls with a temp file path,
+// AF_UNIX and the given type.
+SocketPairKind FilesystemBoundUnixDomainSocketPair(int type);
+
+// AbstractBoundUnixDomainSocketPair returns a SocketPairKind that represents
+// SocketPairs created with bind() and accept() syscalls with a temp abstract
+// path, AF_UNIX and the given type.
+SocketPairKind AbstractBoundUnixDomainSocketPair(int type);
+
+// SocketpairGoferUnixDomainSocketPair returns a SocketPairKind that was created
+// with two sockets connected to the socketpair gofer.
+SocketPairKind SocketpairGoferUnixDomainSocketPair(int type);
+
+// SocketpairGoferFileSocketPair returns a SocketPairKind that was created with
+// two open() calls on paths backed by the socketpair gofer.
+SocketPairKind SocketpairGoferFileSocketPair(int type);
+
+// FilesystemUnboundUnixDomainSocketPair returns a SocketPairKind that
+// represents two unbound sockets and a filesystem path for binding.
+SocketPairKind FilesystemUnboundUnixDomainSocketPair(int type);
+
+// AbstractUnboundUnixDomainSocketPair returns a SocketPairKind that represents
+// two unbound sockets and an abstract namespace path for binding.
+SocketPairKind AbstractUnboundUnixDomainSocketPair(int type);
+
+// SendSingleFD sends both a single FD and some data over a unix domain socket
+// specified by an FD. Note that calls to this function must be wrapped in
+// ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test.
+void SendSingleFD(int sock, int fd, char buf[], int buf_size);
+
+// SendFDs sends an arbitrary number of FDs and some data over a unix domain
+// socket specified by an FD. Note that calls to this function must be wrapped
+// in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test.
+void SendFDs(int sock, int fds[], int fds_size, char buf[], int buf_size);
+
+// RecvSingleFD receives both a single FD and some data over a unix domain
+// socket specified by an FD. Note that calls to this function must be wrapped
+// in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test.
+void RecvSingleFD(int sock, int* fd, char buf[], int buf_size);
+
+// RecvSingleFD receives both a single FD and some data over a unix domain
+// socket specified by an FD. This version allows the expected amount of data
+// received to be different than the buffer size. Note that calls to this
+// function must be wrapped in ASSERT_NO_FATAL_FAILURE for internal assertions
+// to halt the test.
+void RecvSingleFD(int sock, int* fd, char buf[], int buf_size,
+ int expected_size);
+
+// PeekSingleFD peeks at both a single FD and some data over a unix domain
+// socket specified by an FD. Note that calls to this function must be wrapped
+// in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test.
+void PeekSingleFD(int sock, int* fd, char buf[], int buf_size);
+
+// RecvFDs receives both an arbitrary number of FDs and some data over a unix
+// domain socket specified by an FD. Note that calls to this function must be
+// wrapped in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test.
+void RecvFDs(int sock, int fds[], int fds_size, char buf[], int buf_size);
+
+// RecvFDs receives both an arbitrary number of FDs and some data over a unix
+// domain socket specified by an FD. This version allows the expected amount of
+// data received to be different than the buffer size. Note that calls to this
+// function must be wrapped in ASSERT_NO_FATAL_FAILURE for internal assertions
+// to halt the test.
+void RecvFDs(int sock, int fds[], int fds_size, char buf[], int buf_size,
+ int expected_size);
+
+// RecvNoCmsg receives some data over a unix domain socket specified by an FD
+// and asserts that no control messages are available for receiving. Note that
+// calls to this function must be wrapped in ASSERT_NO_FATAL_FAILURE for
+// internal assertions to halt the test.
+void RecvNoCmsg(int sock, char buf[], int buf_size, int expected_size);
+
+inline void RecvNoCmsg(int sock, char buf[], int buf_size) {
+ RecvNoCmsg(sock, buf, buf_size, buf_size);
+}
+
+// SendCreds sends the credentials of the current process and some data over a
+// unix domain socket specified by an FD. Note that calls to this function must
+// be wrapped in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the
+// test.
+void SendCreds(int sock, ucred creds, char buf[], int buf_size);
+
+// SendCredsAndFD sends the credentials of the current process, a single FD, and
+// some data over a unix domain socket specified by an FD. Note that calls to
+// this function must be wrapped in ASSERT_NO_FATAL_FAILURE for internal
+// assertions to halt the test.
+void SendCredsAndFD(int sock, ucred creds, int fd, char buf[], int buf_size);
+
+// RecvCreds receives some credentials and some data over a unix domain socket
+// specified by an FD. Note that calls to this function must be wrapped in
+// ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test.
+void RecvCreds(int sock, ucred* creds, char buf[], int buf_size);
+
+// RecvCreds receives some credentials and some data over a unix domain socket
+// specified by an FD. This version allows the expected amount of data received
+// to be different than the buffer size. Note that calls to this function must
+// be wrapped in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the
+// test.
+void RecvCreds(int sock, ucred* creds, char buf[], int buf_size,
+ int expected_size);
+
+// RecvCredsAndFD receives some credentials, a single FD, and some data over a
+// unix domain socket specified by an FD. Note that calls to this function must
+// be wrapped in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the
+// test.
+void RecvCredsAndFD(int sock, ucred* creds, int* fd, char buf[], int buf_size);
+
+// SendNullCmsg sends a null control message and some data over a unix domain
+// socket specified by an FD. Note that calls to this function must be wrapped
+// in ASSERT_NO_FATAL_FAILURE for internal assertions to halt the test.
+void SendNullCmsg(int sock, char buf[], int buf_size);
+
+// RecvSingleFDUnaligned sends both a single FD and some data over a unix domain
+// socket specified by an FD. This function does not obey the spec, but Linux
+// allows it and the apphosting code depends on this quirk. Note that calls to
+// this function must be wrapped in ASSERT_NO_FATAL_FAILURE for internal
+// assertions to halt the test.
+void RecvSingleFDUnaligned(int sock, int* fd, char buf[], int buf_size);
+
+// SetSoPassCred sets the SO_PASSCRED option on the specified socket.
+void SetSoPassCred(int sock);
+
+// UnsetSoPassCred clears the SO_PASSCRED option on the specified socket.
+void UnsetSoPassCred(int sock);
+
+} // namespace testing
+} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_UNIX_DOMAIN_SOCKET_TEST_UTIL_H_
diff --git a/test/syscalls/linux/unlink.cc b/test/syscalls/linux/unlink.cc
new file mode 100644
index 000000000..2040375c9
--- /dev/null
+++ b/test/syscalls/linux/unlink.cc
@@ -0,0 +1,214 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(UnlinkTest, IsDir) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ EXPECT_THAT(unlink(dir.path().c_str()), SyscallFailsWithErrno(EISDIR));
+}
+
+TEST(UnlinkTest, DirNotEmpty) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ int fd;
+ std::string path = JoinPath(dir.path(), "ExistingFile");
+ EXPECT_THAT(fd = open(path.c_str(), O_RDWR | O_CREAT, 0666),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ EXPECT_THAT(rmdir(dir.path().c_str()), SyscallFailsWithErrno(ENOTEMPTY));
+}
+
+TEST(UnlinkTest, Rmdir) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ EXPECT_THAT(rmdir(dir.path().c_str()), SyscallSucceeds());
+}
+
+TEST(UnlinkTest, AtDir) {
+ int dirfd;
+ auto tmpdir = GetAbsoluteTestTmpdir();
+ EXPECT_THAT(dirfd = open(tmpdir.c_str(), O_DIRECTORY, 0), SyscallSucceeds());
+
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(tmpdir));
+ auto dir_relpath =
+ ASSERT_NO_ERRNO_AND_VALUE(GetRelativePath(tmpdir, dir.path()));
+ EXPECT_THAT(unlinkat(dirfd, dir_relpath.c_str(), AT_REMOVEDIR),
+ SyscallSucceeds());
+ ASSERT_THAT(close(dirfd), SyscallSucceeds());
+}
+
+TEST(UnlinkTest, AtDirDegradedPermissions_NoRandomSave) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+
+ int dirfd;
+ ASSERT_THAT(dirfd = open(dir.path().c_str(), O_DIRECTORY, 0),
+ SyscallSucceeds());
+
+ std::string sub_dir = JoinPath(dir.path(), "NewDir");
+ EXPECT_THAT(mkdir(sub_dir.c_str(), 0755), SyscallSucceeds());
+ EXPECT_THAT(fchmod(dirfd, 0444), SyscallSucceeds());
+ EXPECT_THAT(unlinkat(dirfd, "NewDir", AT_REMOVEDIR),
+ SyscallFailsWithErrno(EACCES));
+ ASSERT_THAT(close(dirfd), SyscallSucceeds());
+}
+
+// Files cannot be unlinked if the parent is not writable and executable.
+TEST(UnlinkTest, ParentDegradedPermissions) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path()));
+
+ ASSERT_THAT(chmod(dir.path().c_str(), 0000), SyscallSucceeds());
+
+ struct stat st;
+ ASSERT_THAT(stat(file.path().c_str(), &st), SyscallFailsWithErrno(EACCES));
+ ASSERT_THAT(unlinkat(AT_FDCWD, file.path().c_str(), 0),
+ SyscallFailsWithErrno(EACCES));
+
+ // Non-existent files also return EACCES.
+ const std::string nonexist = JoinPath(dir.path(), "doesnotexist");
+ ASSERT_THAT(stat(nonexist.c_str(), &st), SyscallFailsWithErrno(EACCES));
+ ASSERT_THAT(unlinkat(AT_FDCWD, nonexist.c_str(), 0),
+ SyscallFailsWithErrno(EACCES));
+}
+
+TEST(UnlinkTest, AtBad) {
+ int dirfd;
+ EXPECT_THAT(dirfd = open(GetAbsoluteTestTmpdir().c_str(), O_DIRECTORY, 0),
+ SyscallSucceeds());
+
+ // Try removing a directory as a file.
+ std::string path = JoinPath(GetAbsoluteTestTmpdir(), "NewDir");
+ EXPECT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds());
+ EXPECT_THAT(unlinkat(dirfd, "NewDir", 0), SyscallFailsWithErrno(EISDIR));
+ EXPECT_THAT(unlinkat(dirfd, "NewDir", AT_REMOVEDIR), SyscallSucceeds());
+
+ // Try removing a file as a directory.
+ int fd;
+ EXPECT_THAT(fd = openat(dirfd, "UnlinkAtFile", O_RDWR | O_CREAT, 0666),
+ SyscallSucceeds());
+ EXPECT_THAT(unlinkat(dirfd, "UnlinkAtFile", AT_REMOVEDIR),
+ SyscallFailsWithErrno(ENOTDIR));
+ EXPECT_THAT(unlinkat(dirfd, "UnlinkAtFile/", 0),
+ SyscallFailsWithErrno(ENOTDIR));
+ ASSERT_THAT(close(fd), SyscallSucceeds());
+ EXPECT_THAT(unlinkat(dirfd, "UnlinkAtFile", 0), SyscallSucceeds());
+
+ // Cleanup.
+ ASSERT_THAT(close(dirfd), SyscallSucceeds());
+}
+
+TEST(UnlinkTest, AbsTmpFile) {
+ int fd;
+ std::string path = JoinPath(GetAbsoluteTestTmpdir(), "ExistingFile");
+ EXPECT_THAT(fd = open(path.c_str(), O_RDWR | O_CREAT, 0666),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ EXPECT_THAT(unlink(path.c_str()), SyscallSucceeds());
+}
+
+TEST(UnlinkTest, TooLongName) {
+ EXPECT_THAT(unlink(std::vector<char>(16384, '0').data()),
+ SyscallFailsWithErrno(ENAMETOOLONG));
+}
+
+TEST(UnlinkTest, BadNamePtr) {
+ EXPECT_THAT(unlink(reinterpret_cast<char*>(1)),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST(UnlinkTest, AtFile) {
+ int dirfd;
+ EXPECT_THAT(dirfd = open(GetAbsoluteTestTmpdir().c_str(), O_DIRECTORY, 0666),
+ SyscallSucceeds());
+ int fd;
+ EXPECT_THAT(fd = openat(dirfd, "UnlinkAtFile", O_RDWR | O_CREAT, 0666),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+ EXPECT_THAT(unlinkat(dirfd, "UnlinkAtFile", 0), SyscallSucceeds());
+}
+
+TEST(UnlinkTest, OpenFile_NoRandomSave) {
+ // We can't save unlinked file unless they are on tmpfs.
+ const DisableSave ds;
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ int fd;
+ EXPECT_THAT(fd = open(file.path().c_str(), O_RDWR, 0666), SyscallSucceeds());
+ EXPECT_THAT(unlink(file.path().c_str()), SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST(UnlinkTest, CannotRemoveDots) {
+ auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const std::string self = JoinPath(file.path(), ".");
+ ASSERT_THAT(unlink(self.c_str()), SyscallFailsWithErrno(ENOTDIR));
+ const std::string parent = JoinPath(file.path(), "..");
+ ASSERT_THAT(unlink(parent.c_str()), SyscallFailsWithErrno(ENOTDIR));
+}
+
+TEST(UnlinkTest, CannotRemoveRoot) {
+ ASSERT_THAT(unlinkat(-1, "/", AT_REMOVEDIR), SyscallFailsWithErrno(EBUSY));
+}
+
+TEST(UnlinkTest, CannotRemoveRootWithAtDir) {
+ const FileDescriptor dirfd = ASSERT_NO_ERRNO_AND_VALUE(
+ Open(GetAbsoluteTestTmpdir(), O_DIRECTORY, 0666));
+ ASSERT_THAT(unlinkat(dirfd.get(), "/", AT_REMOVEDIR),
+ SyscallFailsWithErrno(EBUSY));
+}
+
+TEST(RmdirTest, CannotRemoveDots) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const std::string self = JoinPath(dir.path(), ".");
+ ASSERT_THAT(rmdir(self.c_str()), SyscallFailsWithErrno(EINVAL));
+ const std::string parent = JoinPath(dir.path(), "..");
+ ASSERT_THAT(rmdir(parent.c_str()), SyscallFailsWithErrno(ENOTEMPTY));
+}
+
+TEST(RmdirTest, CanRemoveWithTrailingSlashes) {
+ auto dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const std::string slash = absl::StrCat(dir1.path(), "/");
+ ASSERT_THAT(rmdir(slash.c_str()), SyscallSucceeds());
+ auto dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const std::string slashslash = absl::StrCat(dir2.path(), "//");
+ ASSERT_THAT(rmdir(slashslash.c_str()), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/unshare.cc b/test/syscalls/linux/unshare.cc
new file mode 100644
index 000000000..e32619efe
--- /dev/null
+++ b/test/syscalls/linux/unshare.cc
@@ -0,0 +1,50 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <sched.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/synchronization/mutex.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST(UnshareTest, AllowsZeroFlags) {
+ ASSERT_THAT(unshare(0), SyscallSucceeds());
+}
+
+TEST(UnshareTest, ThreadFlagFailsIfMultithreaded) {
+ absl::Mutex mu;
+ bool finished = false;
+ ScopedThread t([&] {
+ mu.Lock();
+ mu.Await(absl::Condition(&finished));
+ mu.Unlock();
+ });
+ ASSERT_THAT(unshare(CLONE_THREAD), SyscallFailsWithErrno(EINVAL));
+ mu.Lock();
+ finished = true;
+ mu.Unlock();
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc
new file mode 100644
index 000000000..e647d2896
--- /dev/null
+++ b/test/syscalls/linux/utimes.cc
@@ -0,0 +1,319 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#include <utime.h>
+
+#include <string>
+
+#include "absl/time/time.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// TimeBoxed runs fn, setting before and after to (coarse realtime) times
+// guaranteed* to come before and after fn started and completed, respectively.
+//
+// fn may be called more than once if the clock is adjusted.
+void TimeBoxed(absl::Time* before, absl::Time* after,
+ std::function<void()> const& fn) {
+ do {
+ // N.B. utimes and friends use CLOCK_REALTIME_COARSE for setting time (i.e.,
+ // current_kernel_time()). See fs/attr.c:notify_change.
+ //
+ // notify_change truncates the time to a multiple of s_time_gran, but most
+ // filesystems set it to 1, so we don't do any truncation.
+ struct timespec ts;
+ EXPECT_THAT(clock_gettime(CLOCK_REALTIME_COARSE, &ts), SyscallSucceeds());
+ // FIXME(b/132819225): gVisor filesystem timestamps inconsistently use the
+ // internal or host clock, which may diverge slightly. Allow some slack on
+ // times to account for the difference.
+ *before = absl::TimeFromTimespec(ts) - absl::Seconds(1);
+
+ fn();
+
+ EXPECT_THAT(clock_gettime(CLOCK_REALTIME_COARSE, &ts), SyscallSucceeds());
+ *after = absl::TimeFromTimespec(ts) + absl::Seconds(1);
+
+ if (*after < *before) {
+ // Clock jumped backwards; retry.
+ //
+ // Technically this misses jumps small enough to keep after > before,
+ // which could lead to test failures, but that is very unlikely to happen.
+ continue;
+ }
+ } while (*after < *before);
+}
+
+void TestUtimesOnPath(std::string const& path) {
+ struct stat statbuf;
+
+ struct timeval times[2] = {{10, 0}, {20, 0}};
+ EXPECT_THAT(utimes(path.c_str(), times), SyscallSucceeds());
+ EXPECT_THAT(stat(path.c_str(), &statbuf), SyscallSucceeds());
+ EXPECT_EQ(10, statbuf.st_atime);
+ EXPECT_EQ(20, statbuf.st_mtime);
+
+ absl::Time before;
+ absl::Time after;
+ TimeBoxed(&before, &after, [&] {
+ EXPECT_THAT(utimes(path.c_str(), nullptr), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(stat(path.c_str(), &statbuf), SyscallSucceeds());
+
+ absl::Time atime = absl::TimeFromTimespec(statbuf.st_atim);
+ EXPECT_GE(atime, before);
+ EXPECT_LE(atime, after);
+
+ absl::Time mtime = absl::TimeFromTimespec(statbuf.st_mtim);
+ EXPECT_GE(mtime, before);
+ EXPECT_LE(mtime, after);
+}
+
+TEST(UtimesTest, OnFile) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ TestUtimesOnPath(f.path());
+}
+
+TEST(UtimesTest, OnDir) {
+ auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TestUtimesOnPath(dir.path());
+}
+
+TEST(UtimesTest, MissingPath) {
+ auto path = NewTempAbsPath();
+ struct timeval times[2] = {{10, 0}, {20, 0}};
+ EXPECT_THAT(utimes(path.c_str(), times), SyscallFailsWithErrno(ENOENT));
+}
+
+void TestFutimesat(int dirFd, std::string const& path) {
+ struct stat statbuf;
+
+ struct timeval times[2] = {{10, 0}, {20, 0}};
+ EXPECT_THAT(futimesat(dirFd, path.c_str(), times), SyscallSucceeds());
+ EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf, 0), SyscallSucceeds());
+ EXPECT_EQ(10, statbuf.st_atime);
+ EXPECT_EQ(20, statbuf.st_mtime);
+
+ absl::Time before;
+ absl::Time after;
+ TimeBoxed(&before, &after, [&] {
+ EXPECT_THAT(futimesat(dirFd, path.c_str(), nullptr), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf, 0), SyscallSucceeds());
+
+ absl::Time atime = absl::TimeFromTimespec(statbuf.st_atim);
+ EXPECT_GE(atime, before);
+ EXPECT_LE(atime, after);
+
+ absl::Time mtime = absl::TimeFromTimespec(statbuf.st_mtim);
+ EXPECT_GE(mtime, before);
+ EXPECT_LE(mtime, after);
+}
+
+TEST(FutimesatTest, OnAbsPath) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ TestFutimesat(0, f.path());
+}
+
+TEST(FutimesatTest, OnRelPath) {
+ auto d = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(d.path()));
+ auto basename = std::string(Basename(f.path()));
+ const FileDescriptor dirFd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(d.path(), O_RDONLY | O_DIRECTORY));
+ TestFutimesat(dirFd.get(), basename);
+}
+
+TEST(FutimesatTest, InvalidNsec) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ struct timeval times[4][2] = {{
+ {0, 1}, // Valid
+ {1, static_cast<int64_t>(1e7)} // Invalid
+ },
+ {
+ {1, static_cast<int64_t>(1e7)}, // Invalid
+ {0, 1} // Valid
+ },
+ {
+ {0, 1}, // Valid
+ {1, -1} // Invalid
+ },
+ {
+ {1, -1}, // Invalid
+ {0, 1} // Valid
+ }};
+
+ for (unsigned int i = 0; i < sizeof(times) / sizeof(times[0]); i++) {
+ std::cout << "test:" << i << "\n";
+ EXPECT_THAT(futimesat(0, f.path().c_str(), times[i]),
+ SyscallFailsWithErrno(EINVAL));
+ }
+}
+
+void TestUtimensat(int dirFd, std::string const& path) {
+ struct stat statbuf;
+ const struct timespec times[2] = {{10, 0}, {20, 0}};
+ EXPECT_THAT(utimensat(dirFd, path.c_str(), times, 0), SyscallSucceeds());
+ EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf, 0), SyscallSucceeds());
+ EXPECT_EQ(10, statbuf.st_atime);
+ EXPECT_EQ(20, statbuf.st_mtime);
+
+ // Test setting with UTIME_NOW and UTIME_OMIT.
+ struct stat statbuf2;
+ const struct timespec times2[2] = {
+ {0, UTIME_NOW}, // Should set atime to now.
+ {0, UTIME_OMIT} // Should not change mtime.
+ };
+
+ absl::Time before;
+ absl::Time after;
+ TimeBoxed(&before, &after, [&] {
+ EXPECT_THAT(utimensat(dirFd, path.c_str(), times2, 0), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf2, 0), SyscallSucceeds());
+
+ absl::Time atime2 = absl::TimeFromTimespec(statbuf2.st_atim);
+ EXPECT_GE(atime2, before);
+ EXPECT_LE(atime2, after);
+
+ absl::Time mtime = absl::TimeFromTimespec(statbuf.st_mtim);
+ absl::Time mtime2 = absl::TimeFromTimespec(statbuf2.st_mtim);
+ // mtime should not be changed.
+ EXPECT_EQ(mtime, mtime2);
+
+ // Test setting with times = NULL. Should set both atime and mtime to the
+ // current system time.
+ struct stat statbuf3;
+ TimeBoxed(&before, &after, [&] {
+ EXPECT_THAT(utimensat(dirFd, path.c_str(), nullptr, 0), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf3, 0), SyscallSucceeds());
+
+ absl::Time atime3 = absl::TimeFromTimespec(statbuf3.st_atim);
+ EXPECT_GE(atime3, before);
+ EXPECT_LE(atime3, after);
+
+ absl::Time mtime3 = absl::TimeFromTimespec(statbuf3.st_mtim);
+ EXPECT_GE(mtime3, before);
+ EXPECT_LE(mtime3, after);
+
+ EXPECT_EQ(atime3, mtime3);
+}
+
+TEST(UtimensatTest, OnAbsPath) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ TestUtimensat(0, f.path());
+}
+
+TEST(UtimensatTest, OnRelPath) {
+ auto d = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(d.path()));
+ auto basename = std::string(Basename(f.path()));
+ const FileDescriptor dirFd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(d.path(), O_RDONLY | O_DIRECTORY));
+ TestUtimensat(dirFd.get(), basename);
+}
+
+TEST(UtimensatTest, OmitNoop) {
+ // Setting both timespecs to UTIME_OMIT on a nonexistant path should succeed.
+ auto path = NewTempAbsPath();
+ const struct timespec times[2] = {{0, UTIME_OMIT}, {0, UTIME_OMIT}};
+ EXPECT_THAT(utimensat(0, path.c_str(), times, 0), SyscallSucceeds());
+}
+
+// Verify that we can actually set atime and mtime to 0.
+TEST(UtimeTest, ZeroAtimeandMtime) {
+ const auto tmp_dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const auto tmp_file =
+ ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(tmp_dir.path()));
+
+ // Stat the file before and after updating atime and mtime.
+ struct stat stat_before = {};
+ EXPECT_THAT(stat(tmp_file.path().c_str(), &stat_before), SyscallSucceeds());
+
+ ASSERT_NE(stat_before.st_atime, 0);
+ ASSERT_NE(stat_before.st_mtime, 0);
+
+ const struct utimbuf times = {}; // Zero for both atime and mtime.
+ EXPECT_THAT(utime(tmp_file.path().c_str(), &times), SyscallSucceeds());
+
+ struct stat stat_after = {};
+ EXPECT_THAT(stat(tmp_file.path().c_str(), &stat_after), SyscallSucceeds());
+
+ // We should see the atime and mtime changed when we set them to 0.
+ ASSERT_EQ(stat_after.st_atime, 0);
+ ASSERT_EQ(stat_after.st_mtime, 0);
+}
+
+TEST(UtimensatTest, InvalidNsec) {
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ struct timespec times[2][2] = {
+ {
+ {0, UTIME_OMIT}, // Valid
+ {2, static_cast<int64_t>(1e10)} // Invalid
+ },
+ {
+ {2, static_cast<int64_t>(1e10)}, // Invalid
+ {0, UTIME_OMIT} // Valid
+ }};
+
+ for (unsigned int i = 0; i < sizeof(times) / sizeof(times[0]); i++) {
+ std::cout << "test:" << i << "\n";
+ EXPECT_THAT(utimensat(0, f.path().c_str(), times[i], 0),
+ SyscallFailsWithErrno(EINVAL));
+ }
+}
+
+TEST(Utimensat, NullPath) {
+ // From man utimensat(2):
+ // "the Linux utimensat() system call implements a nonstandard feature: if
+ // pathname is NULL, then the call modifies the timestamps of the file
+ // referred to by the file descriptor dirfd (which may refer to any type of
+ // file).
+ // Note, however, that the glibc wrapper for utimensat() disallows
+ // passing NULL as the value for file: the wrapper function returns the error
+ // EINVAL in this case."
+ auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
+ const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR));
+ struct stat statbuf;
+ const struct timespec times[2] = {{10, 0}, {20, 0}};
+ // Call syscall directly.
+ EXPECT_THAT(syscall(SYS_utimensat, fd.get(), NULL, times, 0),
+ SyscallSucceeds());
+ EXPECT_THAT(fstatat(0, f.path().c_str(), &statbuf, 0), SyscallSucceeds());
+ EXPECT_EQ(10, statbuf.st_atime);
+ EXPECT_EQ(20, statbuf.st_mtime);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/vdso.cc b/test/syscalls/linux/vdso.cc
new file mode 100644
index 000000000..19c80add8
--- /dev/null
+++ b/test/syscalls/linux/vdso.cc
@@ -0,0 +1,48 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string.h>
+#include <sys/mman.h>
+
+#include <algorithm>
+
+#include "gtest/gtest.h"
+#include "test/util/fs_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/proc_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// Ensure that the vvar page cannot be made writable.
+TEST(VvarTest, WriteVvar) {
+ auto contents = ASSERT_NO_ERRNO_AND_VALUE(GetContents("/proc/self/maps"));
+ auto maps = ASSERT_NO_ERRNO_AND_VALUE(ParseProcMaps(contents));
+ auto it = std::find_if(maps.begin(), maps.end(), [](const ProcMapsEntry& e) {
+ return e.filename == "[vvar]";
+ });
+
+ SKIP_IF(it == maps.end());
+ EXPECT_THAT(mprotect(reinterpret_cast<void*>(it->start), kPageSize,
+ PROT_READ | PROT_WRITE),
+ SyscallFailsWithErrno(EACCES));
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/vdso_clock_gettime.cc b/test/syscalls/linux/vdso_clock_gettime.cc
new file mode 100644
index 000000000..ce1899f45
--- /dev/null
+++ b/test/syscalls/linux/vdso_clock_gettime.cc
@@ -0,0 +1,108 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdint.h>
+#include <sys/time.h>
+#include <syscall.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <map>
+#include <string>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/numbers.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+std::string PrintClockId(::testing::TestParamInfo<clockid_t> info) {
+ switch (info.param) {
+ case CLOCK_MONOTONIC:
+ return "CLOCK_MONOTONIC";
+ case CLOCK_REALTIME:
+ return "CLOCK_REALTIME";
+ case CLOCK_BOOTTIME:
+ return "CLOCK_BOOTTIME";
+ default:
+ return absl::StrCat(info.param);
+ }
+}
+
+class CorrectVDSOClockTest : public ::testing::TestWithParam<clockid_t> {};
+
+TEST_P(CorrectVDSOClockTest, IsCorrect) {
+ struct timespec tvdso, tsys;
+ absl::Time vdso_time, sys_time;
+ uint64_t total_calls = 0;
+
+ // It is expected that 82.5% of clock_gettime calls will be less than 100us
+ // skewed from the system time.
+ // Unfortunately this is not only influenced by the VDSO clock skew, but also
+ // by arbitrary scheduling delays and the like. The test is therefore
+ // regularly disabled.
+ std::map<absl::Duration, std::tuple<double, uint64_t, uint64_t>> confidence =
+ {
+ {absl::Microseconds(100), std::make_tuple(0.825, 0, 0)},
+ {absl::Microseconds(250), std::make_tuple(0.94, 0, 0)},
+ {absl::Milliseconds(1), std::make_tuple(0.999, 0, 0)},
+ };
+
+ absl::Time start = absl::Now();
+ while (absl::Now() < start + absl::Seconds(30)) {
+ EXPECT_THAT(clock_gettime(GetParam(), &tvdso), SyscallSucceeds());
+ EXPECT_THAT(syscall(__NR_clock_gettime, GetParam(), &tsys),
+ SyscallSucceeds());
+
+ vdso_time = absl::TimeFromTimespec(tvdso);
+
+ for (auto const& conf : confidence) {
+ std::get<1>(confidence[conf.first]) +=
+ (sys_time - vdso_time) < conf.first;
+ }
+
+ sys_time = absl::TimeFromTimespec(tsys);
+
+ for (auto const& conf : confidence) {
+ std::get<2>(confidence[conf.first]) +=
+ (vdso_time - sys_time) < conf.first;
+ }
+
+ ++total_calls;
+ }
+
+ for (auto const& conf : confidence) {
+ EXPECT_GE(std::get<1>(conf.second) / static_cast<double>(total_calls),
+ std::get<0>(conf.second));
+ EXPECT_GE(std::get<2>(conf.second) / static_cast<double>(total_calls),
+ std::get<0>(conf.second));
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(ClockGettime, CorrectVDSOClockTest,
+ ::testing::Values(CLOCK_MONOTONIC, CLOCK_REALTIME,
+ CLOCK_BOOTTIME),
+ PrintClockId);
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/vfork.cc b/test/syscalls/linux/vfork.cc
new file mode 100644
index 000000000..19d05998e
--- /dev/null
+++ b/test/syscalls/linux/vfork.cc
@@ -0,0 +1,195 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <string>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/flags/flag.h"
+#include "absl/time/time.h"
+#include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/test_util.h"
+#include "test/util/time_util.h"
+
+ABSL_FLAG(bool, vfork_test_child, false,
+ "If true, run the VforkTest child workload.");
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// We don't test with raw CLONE_VFORK to avoid interacting with glibc's use of
+// TLS.
+//
+// Even with vfork(2), we must be careful to do little more in the child than
+// call execve(2). We use the simplest sleep function possible, though this is
+// still precarious, as we're officially only allowed to call execve(2) and
+// _exit(2).
+constexpr absl::Duration kChildDelay = absl::Seconds(10);
+
+// Exit code for successful child subprocesses. We don't want to use 0 since
+// it's too common, and an execve(2) failure causes the child to exit with the
+// errno, so kChildExitCode is chosen to be an unlikely errno:
+constexpr int kChildExitCode = 118; // ENOTNAM: Not a XENIX named type file
+
+int64_t MonotonicNow() {
+ struct timespec now;
+ TEST_PCHECK(clock_gettime(CLOCK_MONOTONIC, &now) == 0);
+ return now.tv_sec * 1000000000ll + now.tv_nsec;
+}
+
+TEST(VforkTest, ParentStopsUntilChildExits) {
+ const auto test = [] {
+ // N.B. Run the test in a single-threaded subprocess because
+ // vfork is not safe in a multi-threaded process.
+
+ const int64_t start = MonotonicNow();
+
+ pid_t pid = vfork();
+ if (pid == 0) {
+ SleepSafe(kChildDelay);
+ _exit(kChildExitCode);
+ }
+ TEST_PCHECK_MSG(pid > 0, "vfork failed");
+ MaybeSave();
+
+ const int64_t end = MonotonicNow();
+
+ absl::Duration dur = absl::Nanoseconds(end - start);
+
+ TEST_CHECK(dur >= kChildDelay);
+
+ int status = 0;
+ TEST_PCHECK(RetryEINTR(waitpid)(pid, &status, 0));
+ TEST_CHECK(WIFEXITED(status));
+ TEST_CHECK(WEXITSTATUS(status) == kChildExitCode);
+ };
+
+ EXPECT_THAT(InForkedProcess(test), IsPosixErrorOkAndHolds(0));
+}
+
+TEST(VforkTest, ParentStopsUntilChildExecves_NoRandomSave) {
+ ExecveArray const owned_child_argv = {"/proc/self/exe", "--vfork_test_child"};
+ char* const* const child_argv = owned_child_argv.get();
+
+ const auto test = [&] {
+ const int64_t start = MonotonicNow();
+
+ pid_t pid = vfork();
+ if (pid == 0) {
+ SleepSafe(kChildDelay);
+ execve(child_argv[0], child_argv, /* envp = */ nullptr);
+ _exit(errno);
+ }
+ // Don't attempt save/restore until after recording end_time,
+ // since the test expects an upper bound on the time spent
+ // stopped.
+ int saved_errno = errno;
+ const int64_t end = MonotonicNow();
+ errno = saved_errno;
+ TEST_PCHECK_MSG(pid > 0, "vfork failed");
+ MaybeSave();
+
+ absl::Duration dur = absl::Nanoseconds(end - start);
+
+ // The parent should resume execution after execve, but before
+ // the post-execve test child exits.
+ TEST_CHECK(dur >= kChildDelay);
+ TEST_CHECK(dur <= 2 * kChildDelay);
+
+ int status = 0;
+ TEST_PCHECK(RetryEINTR(waitpid)(pid, &status, 0));
+ TEST_CHECK(WIFEXITED(status));
+ TEST_CHECK(WEXITSTATUS(status) == kChildExitCode);
+ };
+
+ EXPECT_THAT(InForkedProcess(test), IsPosixErrorOkAndHolds(0));
+}
+
+// A vfork child does not unstop the parent a second time when it exits after
+// exec.
+TEST(VforkTest, ExecedChildExitDoesntUnstopParent_NoRandomSave) {
+ ExecveArray const owned_child_argv = {"/proc/self/exe", "--vfork_test_child"};
+ char* const* const child_argv = owned_child_argv.get();
+
+ const auto test = [&] {
+ pid_t pid1 = vfork();
+ if (pid1 == 0) {
+ execve(child_argv[0], child_argv, /* envp = */ nullptr);
+ _exit(errno);
+ }
+ TEST_PCHECK_MSG(pid1 > 0, "vfork failed");
+ MaybeSave();
+
+ // pid1 exec'd and is now sleeping.
+ SleepSafe(kChildDelay / 2);
+
+ const int64_t start = MonotonicNow();
+
+ pid_t pid2 = vfork();
+ if (pid2 == 0) {
+ SleepSafe(kChildDelay);
+ _exit(kChildExitCode);
+ }
+ TEST_PCHECK_MSG(pid2 > 0, "vfork failed");
+ MaybeSave();
+
+ const int64_t end = MonotonicNow();
+
+ absl::Duration dur = absl::Nanoseconds(end - start);
+
+ // The parent should resume execution only after pid2 exits, not
+ // when pid1 exits.
+ TEST_CHECK(dur >= kChildDelay);
+
+ int status = 0;
+ TEST_PCHECK(RetryEINTR(waitpid)(pid1, &status, 0));
+ TEST_CHECK(WIFEXITED(status));
+ TEST_CHECK(WEXITSTATUS(status) == kChildExitCode);
+
+ TEST_PCHECK(RetryEINTR(waitpid)(pid2, &status, 0));
+ TEST_CHECK(WIFEXITED(status));
+ TEST_CHECK(WEXITSTATUS(status) == kChildExitCode);
+ };
+
+ EXPECT_THAT(InForkedProcess(test), IsPosixErrorOkAndHolds(0));
+}
+
+int RunChild() {
+ SleepSafe(kChildDelay);
+ return kChildExitCode;
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
+
+int main(int argc, char** argv) {
+ gvisor::testing::TestInit(&argc, &argv);
+
+ if (absl::GetFlag(FLAGS_vfork_test_child)) {
+ return gvisor::testing::RunChild();
+ }
+
+ return gvisor::testing::RunAllTests();
+}
diff --git a/test/syscalls/linux/vsyscall.cc b/test/syscalls/linux/vsyscall.cc
new file mode 100644
index 000000000..ae4377108
--- /dev/null
+++ b/test/syscalls/linux/vsyscall.cc
@@ -0,0 +1,46 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <time.h>
+
+#include "gtest/gtest.h"
+#include "test/util/proc_util.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+#if defined(__x86_64__) || defined(__i386__)
+time_t vsyscall_time(time_t* t) {
+ constexpr uint64_t kVsyscallTimeEntry = 0xffffffffff600400;
+ return reinterpret_cast<time_t (*)(time_t*)>(kVsyscallTimeEntry)(t);
+}
+
+TEST(VsyscallTest, VsyscallAlwaysAvailableOnGvisor) {
+ SKIP_IF(!IsRunningOnGvisor());
+ // Vsyscall is always advertised by gvisor.
+ EXPECT_TRUE(ASSERT_NO_ERRNO_AND_VALUE(IsVsyscallEnabled()));
+ // Vsyscall should always works on gvisor.
+ time_t t;
+ EXPECT_THAT(vsyscall_time(&t), SyscallSucceeds());
+}
+#endif
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/wait.cc b/test/syscalls/linux/wait.cc
new file mode 100644
index 000000000..944149d5e
--- /dev/null
+++ b/test/syscalls/linux/wait.cc
@@ -0,0 +1,913 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <signal.h>
+#include <sys/mman.h>
+#include <sys/ptrace.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <functional>
+#include <tuple>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/strings/str_cat.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "test/util/cleanup.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/logging.h"
+#include "test/util/multiprocess_util.h"
+#include "test/util/posix_error.h"
+#include "test/util/signal_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+#include "test/util/time_util.h"
+
+using ::testing::UnorderedElementsAre;
+
+// These unit tests focus on the wait4(2) system call, but include a basic
+// checks for the i386 waitpid(2) syscall, which is a subset of wait4(2).
+//
+// NOTE(b/22640830,b/27680907,b/29049891): Some functionality is not tested as
+// it is not currently supported by gVisor:
+// * Process groups.
+// * Core dump status (WCOREDUMP).
+//
+// Tests for waiting on stopped/continued children are in sigstop.cc.
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// The CloneChild function seems to need more than one page of stack space.
+static const size_t kStackSize = 2 * kPageSize;
+
+// The child thread created in CloneAndExit runs this function.
+// This child does not have the TLS setup, so it must not use glibc functions.
+int CloneChild(void* priv) {
+ int64_t sleep = reinterpret_cast<int64_t>(priv);
+ SleepSafe(absl::Seconds(sleep));
+
+ // glibc's _exit(2) function wrapper will helpfully call exit_group(2),
+ // exiting the entire process.
+ syscall(__NR_exit, 0);
+ return 1;
+}
+
+// ForkAndExit forks a child process which exits with exit_code, after
+// sleeping for the specified duration (seconds).
+pid_t ForkAndExit(int exit_code, int64_t sleep) {
+ pid_t child = fork();
+ if (child == 0) {
+ SleepSafe(absl::Seconds(sleep));
+ _exit(exit_code);
+ }
+ return child;
+}
+
+int64_t clock_gettime_nsecs(clockid_t id) {
+ struct timespec ts;
+ TEST_PCHECK(clock_gettime(id, &ts) == 0);
+ return (ts.tv_sec * 1000000000 + ts.tv_nsec);
+}
+
+void spin(int64_t sec) {
+ int64_t ns = sec * 1000000000;
+ int64_t start = clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID);
+ int64_t end = start + ns;
+
+ do {
+ constexpr int kLoopCount = 1000000; // large and arbitrary
+ // volatile to prevent the compiler from skipping this loop.
+ for (volatile int i = 0; i < kLoopCount; i++) {
+ }
+ } while (clock_gettime_nsecs(CLOCK_THREAD_CPUTIME_ID) < end);
+}
+
+// ForkSpinAndExit forks a child process which exits with exit_code, after
+// spinning for the specified duration (seconds).
+pid_t ForkSpinAndExit(int exit_code, int64_t spintime) {
+ pid_t child = fork();
+ if (child == 0) {
+ spin(spintime);
+ _exit(exit_code);
+ }
+ return child;
+}
+
+absl::Duration RusageCpuTime(const struct rusage& ru) {
+ return absl::DurationFromTimeval(ru.ru_utime) +
+ absl::DurationFromTimeval(ru.ru_stime);
+}
+
+// Returns the address of the top of the stack.
+// Free with FreeStack.
+uintptr_t AllocStack() {
+ void* addr = mmap(nullptr, kStackSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (addr == MAP_FAILED) {
+ return reinterpret_cast<uintptr_t>(MAP_FAILED);
+ }
+
+ return reinterpret_cast<uintptr_t>(addr) + kStackSize;
+}
+
+// Frees a stack page allocated with AllocStack.
+int FreeStack(uintptr_t addr) {
+ addr -= kStackSize;
+ return munmap(reinterpret_cast<void*>(addr), kPageSize);
+}
+
+// CloneAndExit clones a child thread, which exits with 0 after sleeping for
+// the specified duration (must be in seconds). extra_flags are ORed against
+// the standard clone(2) flags.
+int CloneAndExit(int64_t sleep, uintptr_t stack, int extra_flags) {
+ return clone(CloneChild, reinterpret_cast<void*>(stack),
+ CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_VM | extra_flags,
+ reinterpret_cast<void*>(sleep));
+}
+
+// Simple wrappers around wait4(2) and waitid(2) that ignore interrupts.
+constexpr auto Wait4 = RetryEINTR(wait4);
+constexpr auto Waitid = RetryEINTR(waitid);
+
+// Fixture for tests parameterized by a function that waits for any child to
+// exit with the given options, checks that it exited with the given code, and
+// then returns its PID.
+//
+// N.B. These tests run in a multi-threaded environment. We assume that
+// background threads do not create child processes and are not themselves
+// created with clone(... | SIGCHLD). Either may cause these tests to
+// erroneously wait on child processes/threads.
+class WaitAnyChildTest : public ::testing::TestWithParam<
+ std::function<PosixErrorOr<pid_t>(int, int)>> {
+ protected:
+ PosixErrorOr<pid_t> WaitAny(int code) { return WaitAnyWithOptions(code, 0); }
+
+ PosixErrorOr<pid_t> WaitAnyWithOptions(int code, int options) {
+ return GetParam()(code, options);
+ }
+};
+
+// Wait for any child to exit.
+TEST_P(WaitAnyChildTest, Fork) {
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds());
+
+ EXPECT_THAT(WaitAny(0), IsPosixErrorOkAndHolds(child));
+}
+
+// Call wait4 for any process after the child has already exited.
+TEST_P(WaitAnyChildTest, AfterExit) {
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds());
+
+ absl::SleepFor(absl::Seconds(5));
+
+ EXPECT_THAT(WaitAny(0), IsPosixErrorOkAndHolds(child));
+}
+
+// Wait for multiple children to exit, waiting for either at a time.
+TEST_P(WaitAnyChildTest, MultipleFork) {
+ pid_t child1, child2;
+ ASSERT_THAT(child1 = ForkAndExit(0, 0), SyscallSucceeds());
+ ASSERT_THAT(child2 = ForkAndExit(0, 0), SyscallSucceeds());
+
+ std::vector<pid_t> pids;
+ pids.push_back(ASSERT_NO_ERRNO_AND_VALUE(WaitAny(0)));
+ pids.push_back(ASSERT_NO_ERRNO_AND_VALUE(WaitAny(0)));
+ EXPECT_THAT(pids, UnorderedElementsAre(child1, child2));
+}
+
+// Wait for any child to exit.
+// A non-CLONE_THREAD child which sends SIGCHLD upon exit behaves much like
+// a forked process.
+TEST_P(WaitAnyChildTest, CloneSIGCHLD) {
+ uintptr_t stack;
+ ASSERT_THAT(stack = AllocStack(), SyscallSucceeds());
+ auto free =
+ Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });
+
+ int child;
+ ASSERT_THAT(child = CloneAndExit(0, stack, SIGCHLD), SyscallSucceeds());
+
+ EXPECT_THAT(WaitAny(0), IsPosixErrorOkAndHolds(child));
+}
+
+// Wait for a child thread and process.
+TEST_P(WaitAnyChildTest, ForkAndClone) {
+ pid_t process;
+ ASSERT_THAT(process = ForkAndExit(0, 0), SyscallSucceeds());
+
+ uintptr_t stack;
+ ASSERT_THAT(stack = AllocStack(), SyscallSucceeds());
+ auto free =
+ Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });
+
+ int thread;
+ // Send SIGCHLD for normal wait semantics.
+ ASSERT_THAT(thread = CloneAndExit(0, stack, SIGCHLD), SyscallSucceeds());
+
+ std::vector<pid_t> pids;
+ pids.push_back(ASSERT_NO_ERRNO_AND_VALUE(WaitAny(0)));
+ pids.push_back(ASSERT_NO_ERRNO_AND_VALUE(WaitAny(0)));
+ EXPECT_THAT(pids, UnorderedElementsAre(process, thread));
+}
+
+// Return immediately if no child has exited.
+TEST_P(WaitAnyChildTest, WaitWNOHANG) {
+ EXPECT_THAT(WaitAnyWithOptions(0, WNOHANG),
+ PosixErrorIs(ECHILD, ::testing::_));
+}
+
+// Bad options passed
+TEST_P(WaitAnyChildTest, BadOption) {
+ EXPECT_THAT(WaitAnyWithOptions(0, 123456),
+ PosixErrorIs(EINVAL, ::testing::_));
+}
+
+TEST_P(WaitAnyChildTest, WaitedChildRusage) {
+ struct rusage before;
+ ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &before), SyscallSucceeds());
+
+ pid_t child;
+ constexpr absl::Duration kSpin = absl::Seconds(3);
+ ASSERT_THAT(child = ForkSpinAndExit(0, absl::ToInt64Seconds(kSpin)),
+ SyscallSucceeds());
+ ASSERT_THAT(WaitAny(0), IsPosixErrorOkAndHolds(child));
+
+ struct rusage after;
+ ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &after), SyscallSucceeds());
+
+ EXPECT_GE(RusageCpuTime(after) - RusageCpuTime(before), kSpin);
+}
+
+TEST_P(WaitAnyChildTest, IgnoredChildRusage) {
+ // "POSIX.1-2001 specifies that if the disposition of SIGCHLD is
+ // set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see
+ // sigaction(2)), then children that terminate do not become zombies and a
+ // call to wait() or waitpid() will block until all children have terminated,
+ // and then fail with errno set to ECHILD." - waitpid(2)
+ //
+ // "RUSAGE_CHILDREN: Return resource usage statistics for all children of the
+ // calling process that have terminated *and been waited for*." -
+ // getrusage(2), emphasis added
+
+ struct sigaction sa;
+ sa.sa_handler = SIG_IGN;
+ const auto cleanup_sigact =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSigaction(SIGCHLD, sa));
+
+ struct rusage before;
+ ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &before), SyscallSucceeds());
+
+ const absl::Duration start =
+ absl::Nanoseconds(clock_gettime_nsecs(CLOCK_MONOTONIC));
+
+ constexpr absl::Duration kSpin = absl::Seconds(3);
+
+ // ForkAndSpin uses CLOCK_THREAD_CPUTIME_ID, which is lower resolution than,
+ // and may diverge from, CLOCK_MONOTONIC, so we allow a small grace period but
+ // still check that we blocked for a while.
+ constexpr absl::Duration kSpinGrace = absl::Milliseconds(100);
+
+ pid_t child;
+ ASSERT_THAT(child = ForkSpinAndExit(0, absl::ToInt64Seconds(kSpin)),
+ SyscallSucceeds());
+ ASSERT_THAT(WaitAny(0), PosixErrorIs(ECHILD, ::testing::_));
+ const absl::Duration end =
+ absl::Nanoseconds(clock_gettime_nsecs(CLOCK_MONOTONIC));
+ EXPECT_GE(end - start, kSpin - kSpinGrace);
+
+ struct rusage after;
+ ASSERT_THAT(getrusage(RUSAGE_CHILDREN, &after), SyscallSucceeds());
+ EXPECT_EQ(before.ru_utime.tv_sec, after.ru_utime.tv_sec);
+ EXPECT_EQ(before.ru_utime.tv_usec, after.ru_utime.tv_usec);
+ EXPECT_EQ(before.ru_stime.tv_sec, after.ru_stime.tv_sec);
+ EXPECT_EQ(before.ru_stime.tv_usec, after.ru_stime.tv_usec);
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ Waiters, WaitAnyChildTest,
+ ::testing::Values(
+ [](int code, int options) -> PosixErrorOr<pid_t> {
+ int status;
+ auto const pid = Wait4(-1, &status, options, nullptr);
+ MaybeSave();
+ if (pid < 0) {
+ return PosixError(errno, "wait4");
+ }
+ if (!WIFEXITED(status) || WEXITSTATUS(status) != code) {
+ return PosixError(
+ EINVAL, absl::StrCat("unexpected wait status: got ", status,
+ ", wanted ", code));
+ }
+ return static_cast<pid_t>(pid);
+ },
+ [](int code, int options) -> PosixErrorOr<pid_t> {
+ siginfo_t si;
+ auto const rv = Waitid(P_ALL, 0, &si, WEXITED | options);
+ MaybeSave();
+ if (rv < 0) {
+ return PosixError(errno, "waitid");
+ }
+ if (si.si_signo != SIGCHLD) {
+ return PosixError(
+ EINVAL, absl::StrCat("unexpected signo: got ", si.si_signo,
+ ", wanted ", SIGCHLD));
+ }
+ if (si.si_status != code) {
+ return PosixError(
+ EINVAL, absl::StrCat("unexpected status: got ", si.si_status,
+ ", wanted ", code));
+ }
+ if (si.si_code != CLD_EXITED) {
+ return PosixError(EINVAL,
+ absl::StrCat("unexpected code: got ", si.si_code,
+ ", wanted ", CLD_EXITED));
+ }
+ auto const uid = getuid();
+ if (si.si_uid != uid) {
+ return PosixError(EINVAL,
+ absl::StrCat("unexpected uid: got ", si.si_uid,
+ ", wanted ", uid));
+ }
+ return static_cast<pid_t>(si.si_pid);
+ }));
+
+// Fixture for tests parameterized by a (sysno, function) tuple. The function
+// takes the PID of a specific child to wait for, waits for it to exit, and
+// checks that it exits with the given code.
+class WaitSpecificChildTest
+ : public ::testing::TestWithParam<
+ std::tuple<int, std::function<PosixError(pid_t, int, int)>>> {
+ protected:
+ int Sysno() { return std::get<0>(GetParam()); }
+
+ PosixError WaitForWithOptions(pid_t pid, int options, int code) {
+ return std::get<1>(GetParam())(pid, options, code);
+ }
+
+ PosixError WaitFor(pid_t pid, int code) {
+ return std::get<1>(GetParam())(pid, 0, code);
+ }
+};
+
+// Wait for specific child to exit.
+TEST_P(WaitSpecificChildTest, Fork) {
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds());
+
+ EXPECT_NO_ERRNO(WaitFor(child, 0));
+}
+
+// Non-zero exit codes are correctly propagated.
+TEST_P(WaitSpecificChildTest, NormalExit) {
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(42, 0), SyscallSucceeds());
+
+ EXPECT_NO_ERRNO(WaitFor(child, 42));
+}
+
+// Wait for multiple children to exit.
+TEST_P(WaitSpecificChildTest, MultipleFork) {
+ pid_t child1, child2;
+ ASSERT_THAT(child1 = ForkAndExit(0, 0), SyscallSucceeds());
+ ASSERT_THAT(child2 = ForkAndExit(0, 0), SyscallSucceeds());
+
+ EXPECT_NO_ERRNO(WaitFor(child1, 0));
+ EXPECT_NO_ERRNO(WaitFor(child2, 0));
+}
+
+// Wait for multiple children to exit, out of the order they were created.
+TEST_P(WaitSpecificChildTest, MultipleForkOutOfOrder) {
+ pid_t child1, child2;
+ ASSERT_THAT(child1 = ForkAndExit(0, 0), SyscallSucceeds());
+ ASSERT_THAT(child2 = ForkAndExit(0, 0), SyscallSucceeds());
+
+ EXPECT_NO_ERRNO(WaitFor(child2, 0));
+ EXPECT_NO_ERRNO(WaitFor(child1, 0));
+}
+
+// Wait for specific child to exit, entering wait4 before the exit occurs.
+TEST_P(WaitSpecificChildTest, ForkSleep) {
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(0, 5), SyscallSucceeds());
+
+ EXPECT_NO_ERRNO(WaitFor(child, 0));
+}
+
+// Wait should block until the child exits.
+TEST_P(WaitSpecificChildTest, ForkBlock) {
+ pid_t child;
+
+ auto start = absl::Now();
+ ASSERT_THAT(child = ForkAndExit(0, 5), SyscallSucceeds());
+
+ EXPECT_NO_ERRNO(WaitFor(child, 0));
+
+ EXPECT_GE(absl::Now() - start, absl::Seconds(5));
+}
+
+// Waiting after the child has already exited returns immediately.
+TEST_P(WaitSpecificChildTest, AfterExit) {
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds());
+
+ absl::SleepFor(absl::Seconds(5));
+
+ EXPECT_NO_ERRNO(WaitFor(child, 0));
+}
+
+// Wait for child of sibling thread.
+TEST_P(WaitSpecificChildTest, SiblingChildren) {
+ absl::Mutex mu;
+ pid_t child;
+ bool ready = false;
+ bool stop = false;
+
+ ScopedThread t([&] {
+ absl::MutexLock ml(&mu);
+ EXPECT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds());
+ ready = true;
+ mu.Await(absl::Condition(&stop));
+ });
+
+ // N.B. This must be declared after ScopedThread, so it is destructed first,
+ // thus waking the thread.
+ absl::MutexLock ml(&mu);
+ mu.Await(absl::Condition(&ready));
+
+ EXPECT_NO_ERRNO(WaitFor(child, 0));
+
+ // Keep the sibling alive until after we've waited so the child isn't
+ // reparented.
+ stop = true;
+}
+
+// Waiting for child of sibling thread not allowed with WNOTHREAD.
+TEST_P(WaitSpecificChildTest, SiblingChildrenWNOTHREAD) {
+ // Linux added WNOTHREAD support to waitid(2) in
+ // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to
+ // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7.
+ //
+ // Skip the test if it isn't supported yet.
+ if (Sysno() == SYS_waitid) {
+ int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WNOTHREAD);
+ SKIP_IF(ret < 0 && errno == EINVAL);
+ }
+
+ absl::Mutex mu;
+ pid_t child;
+ bool ready = false;
+ bool stop = false;
+
+ ScopedThread t([&] {
+ absl::MutexLock ml(&mu);
+ EXPECT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds());
+ ready = true;
+ mu.Await(absl::Condition(&stop));
+
+ // This thread can wait on child.
+ EXPECT_NO_ERRNO(WaitForWithOptions(child, __WNOTHREAD, 0));
+ });
+
+ // N.B. This must be declared after ScopedThread, so it is destructed first,
+ // thus waking the thread.
+ absl::MutexLock ml(&mu);
+ mu.Await(absl::Condition(&ready));
+
+ // This thread can't wait on child.
+ EXPECT_THAT(WaitForWithOptions(child, __WNOTHREAD, 0),
+ PosixErrorIs(ECHILD, ::testing::_));
+
+ // Keep the sibling alive until after we've waited so the child isn't
+ // reparented.
+ stop = true;
+}
+
+// Wait for specific child to exit.
+// A non-CLONE_THREAD child which sends SIGCHLD upon exit behaves much like
+// a forked process.
+TEST_P(WaitSpecificChildTest, CloneSIGCHLD) {
+ uintptr_t stack;
+ ASSERT_THAT(stack = AllocStack(), SyscallSucceeds());
+ auto free =
+ Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });
+
+ int child;
+ ASSERT_THAT(child = CloneAndExit(0, stack, SIGCHLD), SyscallSucceeds());
+
+ EXPECT_NO_ERRNO(WaitFor(child, 0));
+}
+
+// Wait for specific child to exit.
+// A non-CLONE_THREAD child which does not send SIGCHLD upon exit can be waited
+// on, but returns ECHILD.
+TEST_P(WaitSpecificChildTest, CloneNoSIGCHLD) {
+ uintptr_t stack;
+ ASSERT_THAT(stack = AllocStack(), SyscallSucceeds());
+ auto free =
+ Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });
+
+ int child;
+ ASSERT_THAT(child = CloneAndExit(0, stack, 0), SyscallSucceeds());
+
+ EXPECT_THAT(WaitFor(child, 0), PosixErrorIs(ECHILD, ::testing::_));
+}
+
+// Waiting after the child has already exited returns immediately.
+TEST_P(WaitSpecificChildTest, CloneAfterExit) {
+ uintptr_t stack;
+ ASSERT_THAT(stack = AllocStack(), SyscallSucceeds());
+ auto free =
+ Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });
+
+ int child;
+ // Send SIGCHLD for normal wait semantics.
+ ASSERT_THAT(child = CloneAndExit(0, stack, SIGCHLD), SyscallSucceeds());
+
+ absl::SleepFor(absl::Seconds(5));
+
+ EXPECT_NO_ERRNO(WaitFor(child, 0));
+}
+
+// A CLONE_THREAD child cannot be waited on.
+TEST_P(WaitSpecificChildTest, CloneThread) {
+ uintptr_t stack;
+ ASSERT_THAT(stack = AllocStack(), SyscallSucceeds());
+ auto free =
+ Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });
+
+ int child;
+ ASSERT_THAT(child = CloneAndExit(15, stack, CLONE_THREAD), SyscallSucceeds());
+ auto start = absl::Now();
+
+ EXPECT_THAT(WaitFor(child, 0), PosixErrorIs(ECHILD, ::testing::_));
+
+ // Ensure wait4 didn't block.
+ EXPECT_LE(absl::Now() - start, absl::Seconds(10));
+
+ // Since we can't wait on the child, we sleep to try to avoid freeing its
+ // stack before it exits.
+ absl::SleepFor(absl::Seconds(5));
+}
+
+// A child that does not send a SIGCHLD on exit may be waited on with
+// the __WCLONE flag.
+TEST_P(WaitSpecificChildTest, CloneWCLONE) {
+ // Linux added WCLONE support to waitid(2) in
+ // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to
+ // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7.
+ //
+ // Skip the test if it isn't supported yet.
+ if (Sysno() == SYS_waitid) {
+ int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WCLONE);
+ SKIP_IF(ret < 0 && errno == EINVAL);
+ }
+
+ uintptr_t stack;
+ ASSERT_THAT(stack = AllocStack(), SyscallSucceeds());
+ auto free =
+ Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });
+
+ int child;
+ ASSERT_THAT(child = CloneAndExit(0, stack, 0), SyscallSucceeds());
+
+ EXPECT_NO_ERRNO(WaitForWithOptions(child, __WCLONE, 0));
+}
+
+// A forked child cannot be waited on with WCLONE.
+TEST_P(WaitSpecificChildTest, ForkWCLONE) {
+ // Linux added WCLONE support to waitid(2) in
+ // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to
+ // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7.
+ //
+ // Skip the test if it isn't supported yet.
+ if (Sysno() == SYS_waitid) {
+ int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WCLONE);
+ SKIP_IF(ret < 0 && errno == EINVAL);
+ }
+
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds());
+
+ EXPECT_THAT(WaitForWithOptions(child, WNOHANG | __WCLONE, 0),
+ PosixErrorIs(ECHILD, ::testing::_));
+
+ EXPECT_NO_ERRNO(WaitFor(child, 0));
+}
+
+// Any type of child can be waited on with WALL.
+TEST_P(WaitSpecificChildTest, WALL) {
+ // Linux added WALL support to waitid(2) in
+ // 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba ("wait: allow sys_waitid() to
+ // accept __WNOTHREAD/__WCLONE/__WALL"). i.e., Linux 4.7.
+ //
+ // Skip the test if it isn't supported yet.
+ if (Sysno() == SYS_waitid) {
+ int ret = waitid(P_ALL, 0, nullptr, WEXITED | WNOHANG | __WALL);
+ SKIP_IF(ret < 0 && errno == EINVAL);
+ }
+
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds());
+
+ EXPECT_NO_ERRNO(WaitForWithOptions(child, __WALL, 0));
+
+ uintptr_t stack;
+ ASSERT_THAT(stack = AllocStack(), SyscallSucceeds());
+ auto free =
+ Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });
+
+ ASSERT_THAT(child = CloneAndExit(0, stack, 0), SyscallSucceeds());
+
+ EXPECT_NO_ERRNO(WaitForWithOptions(child, __WALL, 0));
+}
+
+// Return ECHILD for bad child.
+TEST_P(WaitSpecificChildTest, BadChild) {
+ EXPECT_THAT(WaitFor(42, 0), PosixErrorIs(ECHILD, ::testing::_));
+}
+
+// Wait for a child process that only exits after calling execve(2) from a
+// non-leader thread.
+TEST_P(WaitSpecificChildTest, AfterChildExecve) {
+ ExecveArray const owned_child_argv = {"/bin/true"};
+ char* const* const child_argv = owned_child_argv.get();
+
+ uintptr_t stack;
+ ASSERT_THAT(stack = AllocStack(), SyscallSucceeds());
+ auto free =
+ Cleanup([stack] { ASSERT_THAT(FreeStack(stack), SyscallSucceeds()); });
+
+ pid_t const child = fork();
+ if (child == 0) {
+ // Give the parent some time to start waiting.
+ SleepSafe(absl::Seconds(5));
+ // Pass CLONE_VFORK to block the original thread in the child process until
+ // the clone thread calls execve, annihilating them both. (This means that
+ // if clone returns at all, something went wrong.)
+ //
+ // N.B. clone(2) is not officially async-signal-safe, but at minimum glibc's
+ // x86_64 implementation is safe. See glibc
+ // sysdeps/unix/sysv/linux/x86_64/clone.S.
+ clone(
+ +[](void* arg) {
+ auto child_argv = static_cast<char* const*>(arg);
+ execve(child_argv[0], child_argv, /* envp = */ nullptr);
+ return errno;
+ },
+ reinterpret_cast<void*>(stack),
+ CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM |
+ CLONE_VFORK,
+ const_cast<char**>(child_argv));
+ _exit(errno);
+ }
+ ASSERT_THAT(child, SyscallSucceeds());
+ EXPECT_NO_ERRNO(WaitFor(child, 0));
+}
+
+PosixError CheckWait4(pid_t pid, int options, int code) {
+ int status;
+ auto const rv = Wait4(pid, &status, options, nullptr);
+ MaybeSave();
+ if (rv < 0) {
+ return PosixError(errno, "wait4");
+ } else if (rv != pid) {
+ return PosixError(
+ EINVAL, absl::StrCat("unexpected pid: got ", rv, ", wanted ", pid));
+ }
+ if (!WIFEXITED(status) || WEXITSTATUS(status) != code) {
+ return PosixError(EINVAL, absl::StrCat("unexpected wait status: got ",
+ status, ", wanted ", code));
+ }
+ return NoError();
+};
+
+PosixError CheckWaitid(pid_t pid, int options, int code) {
+ siginfo_t si;
+ auto const rv = Waitid(P_PID, pid, &si, options | WEXITED);
+ MaybeSave();
+ if (rv < 0) {
+ return PosixError(errno, "waitid");
+ }
+ if (si.si_pid != pid) {
+ return PosixError(EINVAL, absl::StrCat("unexpected pid: got ", si.si_pid,
+ ", wanted ", pid));
+ }
+ if (si.si_signo != SIGCHLD) {
+ return PosixError(EINVAL, absl::StrCat("unexpected signo: got ",
+ si.si_signo, ", wanted ", SIGCHLD));
+ }
+ if (si.si_status != code) {
+ return PosixError(EINVAL, absl::StrCat("unexpected status: got ",
+ si.si_status, ", wanted ", code));
+ }
+ if (si.si_code != CLD_EXITED) {
+ return PosixError(EINVAL, absl::StrCat("unexpected code: got ", si.si_code,
+ ", wanted ", CLD_EXITED));
+ }
+ return NoError();
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ Waiters, WaitSpecificChildTest,
+ ::testing::Values(std::make_tuple(SYS_wait4, CheckWait4),
+ std::make_tuple(SYS_waitid, CheckWaitid)));
+
+// WIFEXITED, WIFSIGNALED, WTERMSIG indicate signal exit.
+TEST(WaitTest, SignalExit) {
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(0, 10), SyscallSucceeds());
+
+ EXPECT_THAT(kill(child, SIGKILL), SyscallSucceeds());
+
+ int status;
+ EXPECT_THAT(Wait4(child, &status, 0, nullptr),
+ SyscallSucceedsWithValue(child));
+
+ EXPECT_FALSE(WIFEXITED(status));
+ EXPECT_TRUE(WIFSIGNALED(status));
+ EXPECT_EQ(SIGKILL, WTERMSIG(status));
+}
+
+// waitid requires at least one option.
+TEST(WaitTest, WaitidOptions) {
+ EXPECT_THAT(Waitid(P_ALL, 0, nullptr, 0), SyscallFailsWithErrno(EINVAL));
+}
+
+// waitid does not wait for a child to exit if not passed WEXITED.
+TEST(WaitTest, WaitidNoWEXITED) {
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(0, 0), SyscallSucceeds());
+ EXPECT_THAT(Waitid(P_ALL, 0, nullptr, WSTOPPED),
+ SyscallFailsWithErrno(ECHILD));
+ EXPECT_THAT(Waitid(P_ALL, 0, nullptr, WEXITED), SyscallSucceeds());
+}
+
+// WNOWAIT allows the same wait result to be returned again.
+TEST(WaitTest, WaitidWNOWAIT) {
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(42, 0), SyscallSucceeds());
+
+ siginfo_t info;
+ ASSERT_THAT(Waitid(P_PID, child, &info, WEXITED | WNOWAIT),
+ SyscallSucceeds());
+ EXPECT_EQ(child, info.si_pid);
+ EXPECT_EQ(SIGCHLD, info.si_signo);
+ EXPECT_EQ(CLD_EXITED, info.si_code);
+ EXPECT_EQ(42, info.si_status);
+
+ ASSERT_THAT(Waitid(P_PID, child, &info, WEXITED), SyscallSucceeds());
+ EXPECT_EQ(child, info.si_pid);
+ EXPECT_EQ(SIGCHLD, info.si_signo);
+ EXPECT_EQ(CLD_EXITED, info.si_code);
+ EXPECT_EQ(42, info.si_status);
+
+ EXPECT_THAT(Waitid(P_PID, child, &info, WEXITED),
+ SyscallFailsWithErrno(ECHILD));
+}
+
+// waitpid(pid, status, options) is equivalent to
+// wait4(pid, status, options, nullptr).
+// This is a dedicated syscall on i386, glibc maps it to wait4 on amd64.
+TEST(WaitTest, WaitPid) {
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(42, 0), SyscallSucceeds());
+
+ int status;
+ EXPECT_THAT(RetryEINTR(waitpid)(child, &status, 0),
+ SyscallSucceedsWithValue(child));
+
+ EXPECT_TRUE(WIFEXITED(status));
+ EXPECT_EQ(42, WEXITSTATUS(status));
+}
+
+// Test that signaling a zombie succeeds. This is a signals test that is in this
+// file for some reason.
+TEST(WaitTest, KillZombie) {
+ pid_t child;
+ ASSERT_THAT(child = ForkAndExit(42, 0), SyscallSucceeds());
+
+ // Sleep for three seconds to ensure the child has exited.
+ absl::SleepFor(absl::Seconds(3));
+
+ // The child is now a zombie. Check that killing it returns 0.
+ EXPECT_THAT(kill(child, SIGTERM), SyscallSucceeds());
+ EXPECT_THAT(kill(child, 0), SyscallSucceeds());
+
+ EXPECT_THAT(Wait4(child, nullptr, 0, nullptr),
+ SyscallSucceedsWithValue(child));
+}
+
+TEST(WaitTest, Wait4Rusage) {
+ pid_t child;
+ constexpr absl::Duration kSpin = absl::Seconds(3);
+ ASSERT_THAT(child = ForkSpinAndExit(21, absl::ToInt64Seconds(kSpin)),
+ SyscallSucceeds());
+
+ int status;
+ struct rusage rusage = {};
+ ASSERT_THAT(Wait4(child, &status, 0, &rusage),
+ SyscallSucceedsWithValue(child));
+
+ EXPECT_TRUE(WIFEXITED(status));
+ EXPECT_EQ(21, WEXITSTATUS(status));
+
+ EXPECT_GE(RusageCpuTime(rusage), kSpin);
+}
+
+TEST(WaitTest, WaitidRusage) {
+ pid_t child;
+ constexpr absl::Duration kSpin = absl::Seconds(3);
+ ASSERT_THAT(child = ForkSpinAndExit(27, absl::ToInt64Seconds(kSpin)),
+ SyscallSucceeds());
+
+ siginfo_t si = {};
+ struct rusage rusage = {};
+
+ // From waitid(2):
+ // The raw waitid() system call takes a fifth argument, of type
+ // struct rusage *. If this argument is non-NULL, then it is used
+ // to return resource usage information about the child, in the
+ // same manner as wait4(2).
+ EXPECT_THAT(
+ RetryEINTR(syscall)(SYS_waitid, P_PID, child, &si, WEXITED, &rusage),
+ SyscallSucceeds());
+ EXPECT_EQ(si.si_signo, SIGCHLD);
+ EXPECT_EQ(si.si_code, CLD_EXITED);
+ EXPECT_EQ(si.si_status, 27);
+ EXPECT_EQ(si.si_pid, child);
+
+ EXPECT_GE(RusageCpuTime(rusage), kSpin);
+}
+
+// After bf959931ddb88c4e4366e96dd22e68fa0db9527c ("wait/ptrace: assume __WALL
+// if the child is traced") (Linux 4.7), tracees are always eligible for
+// waiting, regardless of type.
+TEST(WaitTest, TraceeWALL) {
+ int fds[2];
+ ASSERT_THAT(pipe(fds), SyscallSucceeds());
+ FileDescriptor rfd(fds[0]);
+ FileDescriptor wfd(fds[1]);
+
+ pid_t child = fork();
+ if (child == 0) {
+ // Child.
+ rfd.reset();
+
+ TEST_PCHECK(ptrace(PTRACE_TRACEME, 0, nullptr, nullptr) == 0);
+
+ // Notify parent that we're now a tracee.
+ wfd.reset();
+
+ _exit(0);
+ }
+ ASSERT_THAT(child, SyscallSucceeds());
+
+ wfd.reset();
+
+ // Wait for child to become tracee.
+ char c;
+ EXPECT_THAT(ReadFd(rfd.get(), &c, sizeof(c)), SyscallSucceedsWithValue(0));
+
+ // We can wait on the fork child with WCLONE, as it is a tracee.
+ int status;
+ if (IsRunningOnGvisor()) {
+ ASSERT_THAT(Wait4(child, &status, __WCLONE, nullptr),
+ SyscallSucceedsWithValue(child));
+
+ EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0) << status;
+ } else {
+ // On older versions of Linux, we may get ECHILD.
+ ASSERT_THAT(Wait4(child, &status, __WCLONE, nullptr),
+ ::testing::AnyOf(SyscallSucceedsWithValue(child),
+ SyscallFailsWithErrno(ECHILD)));
+ }
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc
new file mode 100644
index 000000000..39b5b2f56
--- /dev/null
+++ b/test/syscalls/linux/write.cc
@@ -0,0 +1,139 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "test/util/cleanup.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+// TODO(gvisor.dev/issue/2370): This test is currently very rudimentary.
+class WriteTest : public ::testing::Test {
+ public:
+ ssize_t WriteBytes(int fd, int bytes) {
+ std::vector<char> buf(bytes);
+ std::fill(buf.begin(), buf.end(), 'a');
+ return WriteFd(fd, buf.data(), buf.size());
+ }
+};
+
+TEST_F(WriteTest, WriteNoExceedsRLimit) {
+ // Get the current rlimit and restore after test run.
+ struct rlimit initial_lim;
+ ASSERT_THAT(getrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds());
+ auto cleanup = Cleanup([&initial_lim] {
+ EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds());
+ });
+
+ int fd;
+ struct rlimit setlim;
+ const int target_lim = 1024;
+ setlim.rlim_cur = target_lim;
+ setlim.rlim_max = RLIM_INFINITY;
+ const std::string pathname = NewTempAbsPath();
+ ASSERT_THAT(fd = open(pathname.c_str(), O_WRONLY | O_CREAT, S_IRWXU),
+ SyscallSucceeds());
+ ASSERT_THAT(setrlimit(RLIMIT_FSIZE, &setlim), SyscallSucceeds());
+
+ EXPECT_THAT(WriteBytes(fd, target_lim), SyscallSucceedsWithValue(target_lim));
+
+ std::vector<char> buf(target_lim + 1);
+ std::fill(buf.begin(), buf.end(), 'a');
+ EXPECT_THAT(pwrite(fd, buf.data(), target_lim, 1), SyscallSucceeds());
+ EXPECT_THAT(pwrite64(fd, buf.data(), target_lim, 1), SyscallSucceeds());
+
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+TEST_F(WriteTest, WriteExceedsRLimit) {
+ // Get the current rlimit and restore after test run.
+ struct rlimit initial_lim;
+ ASSERT_THAT(getrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds());
+ auto cleanup = Cleanup([&initial_lim] {
+ EXPECT_THAT(setrlimit(RLIMIT_FSIZE, &initial_lim), SyscallSucceeds());
+ });
+
+ int fd;
+ sigset_t filesize_mask;
+ sigemptyset(&filesize_mask);
+ sigaddset(&filesize_mask, SIGXFSZ);
+
+ struct rlimit setlim;
+ const int target_lim = 1024;
+ setlim.rlim_cur = target_lim;
+ setlim.rlim_max = RLIM_INFINITY;
+
+ const std::string pathname = NewTempAbsPath();
+ ASSERT_THAT(fd = open(pathname.c_str(), O_WRONLY | O_CREAT, S_IRWXU),
+ SyscallSucceeds());
+ ASSERT_THAT(setrlimit(RLIMIT_FSIZE, &setlim), SyscallSucceeds());
+ ASSERT_THAT(sigprocmask(SIG_BLOCK, &filesize_mask, nullptr),
+ SyscallSucceeds());
+ std::vector<char> buf(target_lim + 2);
+ std::fill(buf.begin(), buf.end(), 'a');
+
+ EXPECT_THAT(write(fd, buf.data(), target_lim + 1),
+ SyscallSucceedsWithValue(target_lim));
+ EXPECT_THAT(write(fd, buf.data(), 1), SyscallFailsWithErrno(EFBIG));
+ siginfo_t info;
+ struct timespec timelimit = {0, 0};
+ ASSERT_THAT(RetryEINTR(sigtimedwait)(&filesize_mask, &info, &timelimit),
+ SyscallSucceedsWithValue(SIGXFSZ));
+ EXPECT_EQ(info.si_code, SI_USER);
+ EXPECT_EQ(info.si_pid, getpid());
+ EXPECT_EQ(info.si_uid, getuid());
+
+ EXPECT_THAT(pwrite(fd, buf.data(), target_lim + 1, 1),
+ SyscallSucceedsWithValue(target_lim - 1));
+ EXPECT_THAT(pwrite(fd, buf.data(), 1, target_lim),
+ SyscallFailsWithErrno(EFBIG));
+ ASSERT_THAT(RetryEINTR(sigtimedwait)(&filesize_mask, &info, &timelimit),
+ SyscallSucceedsWithValue(SIGXFSZ));
+ EXPECT_EQ(info.si_code, SI_USER);
+ EXPECT_EQ(info.si_pid, getpid());
+ EXPECT_EQ(info.si_uid, getuid());
+
+ EXPECT_THAT(pwrite64(fd, buf.data(), target_lim + 1, 1),
+ SyscallSucceedsWithValue(target_lim - 1));
+ EXPECT_THAT(pwrite64(fd, buf.data(), 1, target_lim),
+ SyscallFailsWithErrno(EFBIG));
+ ASSERT_THAT(RetryEINTR(sigtimedwait)(&filesize_mask, &info, &timelimit),
+ SyscallSucceedsWithValue(SIGXFSZ));
+ EXPECT_EQ(info.si_code, SI_USER);
+ EXPECT_EQ(info.si_pid, getpid());
+ EXPECT_EQ(info.si_uid, getuid());
+
+ ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &filesize_mask, nullptr),
+ SyscallSucceeds());
+ EXPECT_THAT(close(fd), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc
new file mode 100644
index 000000000..cbcf08451
--- /dev/null
+++ b/test/syscalls/linux/xattr.cc
@@ -0,0 +1,610 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "absl/container/flat_hash_set.h"
+#include "test/syscalls/linux/file_base.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/posix_error.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+class XattrTest : public FileTest {};
+
+TEST_F(XattrTest, XattrNonexistentFile) {
+ const char* path = "/does/not/exist";
+ const char* name = "user.test";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(ENOENT));
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENOENT));
+ EXPECT_THAT(listxattr(path, nullptr, 0), SyscallFailsWithErrno(ENOENT));
+ EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(ENOENT));
+}
+
+TEST_F(XattrTest, XattrNullName) {
+ const char* path = test_file_name_.c_str();
+
+ EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(EFAULT));
+ EXPECT_THAT(getxattr(path, nullptr, nullptr, 0),
+ SyscallFailsWithErrno(EFAULT));
+ EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, XattrEmptyName) {
+ const char* path = test_file_name_.c_str();
+
+ EXPECT_THAT(setxattr(path, "", nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(ERANGE));
+ EXPECT_THAT(getxattr(path, "", nullptr, 0), SyscallFailsWithErrno(ERANGE));
+ EXPECT_THAT(removexattr(path, ""), SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, XattrLargeName) {
+ const char* path = test_file_name_.c_str();
+ std::string name = "user.";
+ name += std::string(XATTR_NAME_MAX - name.length(), 'a');
+
+ if (!IsRunningOnGvisor()) {
+ // In gVisor, access to xattrs is controlled with an explicit list of
+ // allowed names. This name isn't going to be configured to allow access, so
+ // don't test it.
+ EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+ SyscallSucceeds());
+ EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+ SyscallSucceedsWithValue(0));
+ }
+
+ name += "a";
+ EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(ERANGE));
+ EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+ SyscallFailsWithErrno(ERANGE));
+ EXPECT_THAT(removexattr(path, name.c_str()), SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, XattrInvalidPrefix) {
+ const char* path = test_file_name_.c_str();
+ std::string name(XATTR_NAME_MAX, 'a');
+ EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+ EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+ EXPECT_THAT(removexattr(path, name.c_str()),
+ SyscallFailsWithErrno(EOPNOTSUPP));
+}
+
+// Do not allow save/restore cycles after making the test file read-only, as
+// the restore will fail to open it with r/w permissions.
+TEST_F(XattrTest, XattrReadOnly_NoRandomSave) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ char val = 'a';
+ size_t size = sizeof(val);
+
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+ DisableSave ds;
+ ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IRUSR));
+
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0),
+ SyscallFailsWithErrno(EACCES));
+ EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EACCES));
+
+ char buf = '-';
+ EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
+ EXPECT_EQ(buf, val);
+
+ char list[sizeof(name)];
+ EXPECT_THAT(listxattr(path, list, sizeof(list)),
+ SyscallSucceedsWithValue(sizeof(name)));
+ EXPECT_STREQ(list, name);
+}
+
+// Do not allow save/restore cycles after making the test file write-only, as
+// the restore will fail to open it with r/w permissions.
+TEST_F(XattrTest, XattrWriteOnly_NoRandomSave) {
+ // Drop capabilities that allow us to override file and directory permissions.
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false));
+ ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false));
+
+ DisableSave ds;
+ ASSERT_NO_ERRNO(testing::Chmod(test_file_name_, S_IWUSR));
+
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ char val = 'a';
+ size_t size = sizeof(val);
+
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(EACCES));
+
+ // listxattr will succeed even without read permissions.
+ char list[sizeof(name)];
+ EXPECT_THAT(listxattr(path, list, sizeof(list)),
+ SyscallSucceedsWithValue(sizeof(name)));
+ EXPECT_STREQ(list, name);
+
+ EXPECT_THAT(removexattr(path, name), SyscallSucceeds());
+}
+
+TEST_F(XattrTest, XattrTrustedWithNonadmin) {
+ // TODO(b/148380782): Support setxattr and getxattr with "trusted" prefix.
+ SKIP_IF(IsRunningOnGvisor());
+ SKIP_IF(ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN)));
+
+ const char* path = test_file_name_.c_str();
+ const char name[] = "trusted.abc";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, XattrOnDirectory) {
+ TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ const char name[] = "user.test";
+ EXPECT_THAT(setxattr(dir.path().c_str(), name, nullptr, 0, /*flags=*/0),
+ SyscallSucceeds());
+ EXPECT_THAT(getxattr(dir.path().c_str(), name, nullptr, 0),
+ SyscallSucceedsWithValue(0));
+
+ char list[sizeof(name)];
+ EXPECT_THAT(listxattr(dir.path().c_str(), list, sizeof(list)),
+ SyscallSucceedsWithValue(sizeof(name)));
+ EXPECT_STREQ(list, name);
+
+ EXPECT_THAT(removexattr(dir.path().c_str(), name), SyscallSucceeds());
+}
+
+TEST_F(XattrTest, XattrOnSymlink) {
+ TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
+ const char name[] = "user.test";
+ EXPECT_THAT(setxattr(link.path().c_str(), name, nullptr, 0, /*flags=*/0),
+ SyscallSucceeds());
+ EXPECT_THAT(getxattr(link.path().c_str(), name, nullptr, 0),
+ SyscallSucceedsWithValue(0));
+
+ char list[sizeof(name)];
+ EXPECT_THAT(listxattr(link.path().c_str(), list, sizeof(list)),
+ SyscallSucceedsWithValue(sizeof(name)));
+ EXPECT_STREQ(list, name);
+
+ EXPECT_THAT(removexattr(link.path().c_str(), name), SyscallSucceeds());
+}
+
+TEST_F(XattrTest, XattrOnInvalidFileTypes) {
+ const char name[] = "user.test";
+
+ char char_device[] = "/dev/zero";
+ EXPECT_THAT(setxattr(char_device, name, nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(getxattr(char_device, name, nullptr, 0),
+ SyscallFailsWithErrno(ENODATA));
+ EXPECT_THAT(listxattr(char_device, nullptr, 0), SyscallSucceedsWithValue(0));
+
+ // Use tmpfs, where creation of named pipes is supported.
+ const std::string fifo = NewTempAbsPathInDir("/dev/shm");
+ const char* path = fifo.c_str();
+ EXPECT_THAT(mknod(path, S_IFIFO | S_IRUSR | S_IWUSR, 0), SyscallSucceeds());
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+ EXPECT_THAT(listxattr(path, nullptr, 0), SyscallSucceedsWithValue(0));
+ EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(EPERM));
+}
+
+TEST_F(XattrTest, SetxattrSizeSmallerThanValue) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ std::vector<char> val = {'a', 'a'};
+ size_t size = 1;
+ EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+ SyscallSucceeds());
+
+ std::vector<char> buf = {'-', '-'};
+ std::vector<char> expected_buf = {'a', '-'};
+ EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+ SyscallSucceedsWithValue(size));
+ EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrZeroSize) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ char val = 'a';
+ EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
+
+ char buf = '-';
+ EXPECT_THAT(getxattr(path, name, &buf, XATTR_SIZE_MAX),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, SetxattrSizeTooLarge) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+
+ // Note that each particular fs implementation may stipulate a lower size
+ // limit, in which case we actually may fail (e.g. error with ENOSPC) for
+ // some sizes under XATTR_SIZE_MAX.
+ size_t size = XATTR_SIZE_MAX + 1;
+ std::vector<char> val(size);
+ EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+ SyscallFailsWithErrno(E2BIG));
+
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, SetxattrNullValueAndNonzeroSize) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ EXPECT_THAT(setxattr(path, name, nullptr, 1, /*flags=*/0),
+ SyscallFailsWithErrno(EFAULT));
+
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, SetxattrNullValueAndZeroSize) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrValueTooLargeButOKSize) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ std::vector<char> val(XATTR_SIZE_MAX + 1);
+ std::fill(val.begin(), val.end(), 'a');
+ size_t size = 1;
+ EXPECT_THAT(setxattr(path, name, val.data(), size, /*flags=*/0),
+ SyscallSucceeds());
+
+ std::vector<char> buf = {'-', '-'};
+ std::vector<char> expected_buf = {'a', '-'};
+ EXPECT_THAT(getxattr(path, name, buf.data(), size),
+ SyscallSucceedsWithValue(size));
+ EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrReplaceWithSmaller) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ std::vector<char> val = {'a', 'a'};
+ EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
+ SyscallSucceeds());
+ EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
+ SyscallSucceeds());
+
+ std::vector<char> buf = {'-', '-'};
+ std::vector<char> expected_buf = {'a', '-'};
+ EXPECT_THAT(getxattr(path, name, buf.data(), 2), SyscallSucceedsWithValue(1));
+ EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, SetxattrReplaceWithLarger) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ std::vector<char> val = {'a', 'a'};
+ EXPECT_THAT(setxattr(path, name, val.data(), 1, /*flags=*/0),
+ SyscallSucceeds());
+ EXPECT_THAT(setxattr(path, name, val.data(), 2, /*flags=*/0),
+ SyscallSucceeds());
+
+ std::vector<char> buf = {'-', '-'};
+ EXPECT_THAT(getxattr(path, name, buf.data(), 2), SyscallSucceedsWithValue(2));
+ EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, SetxattrCreateFlag) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
+ SyscallSucceeds());
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_CREATE),
+ SyscallFailsWithErrno(EEXIST));
+
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrReplaceFlag) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
+ SyscallFailsWithErrno(ENODATA));
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, XATTR_REPLACE),
+ SyscallSucceeds());
+
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, SetxattrInvalidFlags) {
+ const char* path = test_file_name_.c_str();
+ int invalid_flags = 0xff;
+ EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, invalid_flags),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_F(XattrTest, Getxattr) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ int val = 1234;
+ size_t size = sizeof(val);
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+ int buf = 0;
+ EXPECT_THAT(getxattr(path, name, &buf, size), SyscallSucceedsWithValue(size));
+ EXPECT_EQ(buf, val);
+}
+
+TEST_F(XattrTest, GetxattrSizeSmallerThanValue) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ std::vector<char> val = {'a', 'a'};
+ size_t size = val.size();
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+ char buf = '-';
+ EXPECT_THAT(getxattr(path, name, &buf, 1), SyscallFailsWithErrno(ERANGE));
+ EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, GetxattrSizeLargerThanValue) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ char val = 'a';
+ EXPECT_THAT(setxattr(path, name, &val, 1, /*flags=*/0), SyscallSucceeds());
+
+ std::vector<char> buf(XATTR_SIZE_MAX);
+ std::fill(buf.begin(), buf.end(), '-');
+ std::vector<char> expected_buf = buf;
+ expected_buf[0] = 'a';
+ EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+ SyscallSucceedsWithValue(1));
+ EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, GetxattrZeroSize) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ char val = 'a';
+ EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
+ SyscallSucceeds());
+
+ char buf = '-';
+ EXPECT_THAT(getxattr(path, name, &buf, 0),
+ SyscallSucceedsWithValue(sizeof(val)));
+ EXPECT_EQ(buf, '-');
+}
+
+TEST_F(XattrTest, GetxattrSizeTooLarge) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ char val = 'a';
+ EXPECT_THAT(setxattr(path, name, &val, sizeof(val), /*flags=*/0),
+ SyscallSucceeds());
+
+ std::vector<char> buf(XATTR_SIZE_MAX + 1);
+ std::fill(buf.begin(), buf.end(), '-');
+ std::vector<char> expected_buf = buf;
+ expected_buf[0] = 'a';
+ EXPECT_THAT(getxattr(path, name, buf.data(), buf.size()),
+ SyscallSucceedsWithValue(sizeof(val)));
+ EXPECT_EQ(buf, expected_buf);
+}
+
+TEST_F(XattrTest, GetxattrNullValue) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ char val = 'a';
+ size_t size = sizeof(val);
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+
+ EXPECT_THAT(getxattr(path, name, nullptr, size),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, GetxattrNullValueAndZeroSize) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ char val = 'a';
+ size_t size = sizeof(val);
+ // Set value with zero size.
+ EXPECT_THAT(setxattr(path, name, &val, 0, /*flags=*/0), SyscallSucceeds());
+ // Get value with nonzero size.
+ EXPECT_THAT(getxattr(path, name, nullptr, size), SyscallSucceedsWithValue(0));
+
+ // Set value with nonzero size.
+ EXPECT_THAT(setxattr(path, name, &val, size, /*flags=*/0), SyscallSucceeds());
+ // Get value with zero size.
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallSucceedsWithValue(size));
+}
+
+TEST_F(XattrTest, GetxattrNonexistentName) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, Listxattr) {
+ const char* path = test_file_name_.c_str();
+ const std::string name = "user.test";
+ const std::string name2 = "user.test2";
+ const std::string name3 = "user.test3";
+ EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0),
+ SyscallSucceeds());
+ EXPECT_THAT(setxattr(path, name2.c_str(), nullptr, 0, /*flags=*/0),
+ SyscallSucceeds());
+ EXPECT_THAT(setxattr(path, name3.c_str(), nullptr, 0, /*flags=*/0),
+ SyscallSucceeds());
+
+ std::vector<char> list(name.size() + 1 + name2.size() + 1 + name3.size() + 1);
+ char* buf = list.data();
+ EXPECT_THAT(listxattr(path, buf, XATTR_SIZE_MAX),
+ SyscallSucceedsWithValue(list.size()));
+
+ absl::flat_hash_set<std::string> got = {};
+ for (char* p = buf; p < buf + list.size(); p += strlen(p) + 1) {
+ got.insert(std::string{p});
+ }
+
+ absl::flat_hash_set<std::string> expected = {name, name2, name3};
+ EXPECT_EQ(got, expected);
+}
+
+TEST_F(XattrTest, ListxattrNoXattrs) {
+ const char* path = test_file_name_.c_str();
+
+ std::vector<char> list, expected;
+ EXPECT_THAT(listxattr(path, list.data(), sizeof(list)),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(list, expected);
+
+ // Listxattr should succeed if there are no attributes, even if the buffer
+ // passed in is a nullptr.
+ EXPECT_THAT(listxattr(path, nullptr, sizeof(list)),
+ SyscallSucceedsWithValue(0));
+}
+
+TEST_F(XattrTest, ListxattrNullBuffer) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+ EXPECT_THAT(listxattr(path, nullptr, sizeof(name)),
+ SyscallFailsWithErrno(EFAULT));
+}
+
+TEST_F(XattrTest, ListxattrSizeTooSmall) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+
+ char list[sizeof(name) - 1];
+ EXPECT_THAT(listxattr(path, list, sizeof(list)),
+ SyscallFailsWithErrno(ERANGE));
+}
+
+TEST_F(XattrTest, ListxattrZeroSize) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+ EXPECT_THAT(listxattr(path, nullptr, 0),
+ SyscallSucceedsWithValue(sizeof(name)));
+}
+
+TEST_F(XattrTest, RemoveXattr) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallSucceeds());
+ EXPECT_THAT(removexattr(path, name), SyscallSucceeds());
+ EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, RemoveXattrNonexistentName) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(ENODATA));
+}
+
+TEST_F(XattrTest, LXattrOnSymlink) {
+ const char name[] = "user.test";
+ TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir());
+ TempPath link = ASSERT_NO_ERRNO_AND_VALUE(
+ TempPath::CreateSymlinkTo(dir.path(), test_file_name_));
+
+ EXPECT_THAT(lsetxattr(link.path().c_str(), name, nullptr, 0, 0),
+ SyscallFailsWithErrno(EPERM));
+ EXPECT_THAT(lgetxattr(link.path().c_str(), name, nullptr, 0),
+ SyscallFailsWithErrno(ENODATA));
+ EXPECT_THAT(llistxattr(link.path().c_str(), nullptr, 0),
+ SyscallSucceedsWithValue(0));
+ EXPECT_THAT(lremovexattr(link.path().c_str(), name),
+ SyscallFailsWithErrno(EPERM));
+}
+
+TEST_F(XattrTest, LXattrOnNonsymlink) {
+ const char* path = test_file_name_.c_str();
+ const char name[] = "user.test";
+ int val = 1234;
+ size_t size = sizeof(val);
+ EXPECT_THAT(lsetxattr(path, name, &val, size, /*flags=*/0),
+ SyscallSucceeds());
+
+ int buf = 0;
+ EXPECT_THAT(lgetxattr(path, name, &buf, size),
+ SyscallSucceedsWithValue(size));
+ EXPECT_EQ(buf, val);
+
+ char list[sizeof(name)];
+ EXPECT_THAT(llistxattr(path, list, sizeof(list)),
+ SyscallSucceedsWithValue(sizeof(name)));
+ EXPECT_STREQ(list, name);
+
+ EXPECT_THAT(lremovexattr(path, name), SyscallSucceeds());
+}
+
+TEST_F(XattrTest, XattrWithFD) {
+ const FileDescriptor fd =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_.c_str(), 0));
+ const char name[] = "user.test";
+ int val = 1234;
+ size_t size = sizeof(val);
+ EXPECT_THAT(fsetxattr(fd.get(), name, &val, size, /*flags=*/0),
+ SyscallSucceeds());
+
+ int buf = 0;
+ EXPECT_THAT(fgetxattr(fd.get(), name, &buf, size),
+ SyscallSucceedsWithValue(size));
+ EXPECT_EQ(buf, val);
+
+ char list[sizeof(name)];
+ EXPECT_THAT(flistxattr(fd.get(), list, sizeof(list)),
+ SyscallSucceedsWithValue(sizeof(name)));
+ EXPECT_STREQ(list, name);
+
+ EXPECT_THAT(fremovexattr(fd.get(), name), SyscallSucceeds());
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor