author    Kevin Krakauer <krakauer@google.com>  2019-06-12 15:21:22 -0700
committer Kevin Krakauer <krakauer@google.com>  2019-06-12 15:21:22 -0700
commit    0bbbcafd68154e7c7b46692b84a39fb6bb5f1568 (patch)
tree      d8fba01ad76900715665b0418a786de2d77e2a05
parent    06a83df533244dc2b3b8adfc1bf0608d3753c1d9 (diff)
parent    70578806e8d3e01fae2249b3e602cd5b05d378a0 (diff)
Merge branch 'master' into iptables-1-pkg
Change-Id: I7457a11de4725e1bf3811420c505d225b1cb6943
-rw-r--r-- .bazelrc | 2
-rw-r--r-- BUILD | 7
-rw-r--r-- README.md | 7
-rw-r--r-- WORKSPACE | 67
-rw-r--r-- cloudbuild/go.Dockerfile | 2
-rw-r--r-- cloudbuild/go.yaml | 22
-rw-r--r-- go.mod | 21
-rw-r--r-- kokoro/common.cfg | 2
l--------- [-rwxr-xr-x] kokoro/run_build.sh | 43
l--------- [-rwxr-xr-x] kokoro/run_tests.sh | 259
-rw-r--r-- pkg/abi/linux/capability.go | 84
-rw-r--r-- pkg/abi/linux/mm.go | 9
-rw-r--r-- pkg/abi/linux/prctl.go | 7
-rw-r--r-- pkg/abi/linux/socket.go | 34
-rw-r--r-- pkg/memutil/BUILD | 11
-rw-r--r-- pkg/memutil/memutil_unsafe.go (renamed from pkg/sentry/memutil/memutil_unsafe.go) | 3
-rw-r--r-- pkg/procid/BUILD (renamed from pkg/sentry/platform/procid/BUILD) | 4
-rw-r--r-- pkg/procid/procid.go (renamed from pkg/sentry/platform/procid/procid.go) | 0
-rw-r--r-- pkg/procid/procid_amd64.s (renamed from pkg/sentry/platform/procid/procid_amd64.s) | 0
-rw-r--r-- pkg/procid/procid_arm64.s (renamed from pkg/sentry/platform/procid/procid_arm64.s) | 0
-rw-r--r-- pkg/procid/procid_net_test.go (renamed from pkg/sentry/platform/procid/procid_net_test.go) | 0
-rw-r--r-- pkg/procid/procid_test.go (renamed from pkg/sentry/platform/procid/procid_test.go) | 0
-rw-r--r-- pkg/sentry/context/contexttest/BUILD | 2
-rw-r--r-- pkg/sentry/context/contexttest/contexttest.go | 2
-rw-r--r-- pkg/sentry/fs/dirent.go | 2
-rw-r--r-- pkg/sentry/fs/file.go | 24
-rw-r--r-- pkg/sentry/fs/gofer/socket.go | 16
-rw-r--r-- pkg/sentry/fs/host/socket.go | 73
-rw-r--r-- pkg/sentry/fs/host/socket_test.go | 156
-rw-r--r-- pkg/sentry/fs/inode.go | 4
-rw-r--r-- pkg/sentry/fs/inode_overlay.go | 12
-rw-r--r-- pkg/sentry/fs/proc/BUILD | 1
-rw-r--r-- pkg/sentry/fs/proc/inode.go | 40
-rw-r--r-- pkg/sentry/fs/proc/net.go | 34
-rw-r--r-- pkg/sentry/fs/proc/task.go | 17
-rw-r--r-- pkg/sentry/fs/timerfd/timerfd.go | 2
-rw-r--r-- pkg/sentry/fs/tmpfs/fs.go | 25
-rw-r--r-- pkg/sentry/hostmm/BUILD | 18
-rw-r--r-- pkg/sentry/hostmm/cgroup.go | 111
-rw-r--r-- pkg/sentry/hostmm/hostmm.go | 130
-rw-r--r-- pkg/sentry/kernel/BUILD | 13
-rw-r--r-- pkg/sentry/kernel/epoll/epoll.go | 2
-rw-r--r-- pkg/sentry/kernel/eventfd/eventfd.go | 2
-rw-r--r-- pkg/sentry/kernel/kernel.go | 55
-rw-r--r-- pkg/sentry/kernel/pipe/node.go | 12
-rw-r--r-- pkg/sentry/kernel/pipe/node_test.go | 8
-rw-r--r-- pkg/sentry/kernel/pipe/pipe.go | 39
-rw-r--r-- pkg/sentry/kernel/ptrace.go | 17
-rw-r--r-- pkg/sentry/kernel/syscalls.go | 60
-rw-r--r-- pkg/sentry/kernel/table_test.go | 8
-rw-r--r-- pkg/sentry/kernel/task.go | 7
-rw-r--r-- pkg/sentry/kernel/task_exec.go | 7
-rw-r--r-- pkg/sentry/kernel/task_identity.go | 24
-rw-r--r-- pkg/sentry/kernel/task_sched.go | 4
-rw-r--r-- pkg/sentry/memutil/BUILD | 14
-rw-r--r-- pkg/sentry/mm/lifecycle.go | 6
-rw-r--r-- pkg/sentry/mm/metadata.go | 30
-rw-r--r-- pkg/sentry/mm/mm.go | 12
-rw-r--r-- pkg/sentry/mm/syscalls.go | 63
-rw-r--r-- pkg/sentry/mm/vma.go | 3
-rw-r--r-- pkg/sentry/pgalloc/BUILD | 3
-rw-r--r-- pkg/sentry/pgalloc/pgalloc.go | 63
-rw-r--r-- pkg/sentry/platform/kvm/BUILD | 2
-rw-r--r-- pkg/sentry/platform/kvm/machine.go | 2
-rw-r--r-- pkg/sentry/platform/ptrace/BUILD | 2
-rw-r--r-- pkg/sentry/platform/ptrace/subprocess.go | 21
-rw-r--r-- pkg/sentry/platform/ptrace/subprocess_amd64.go | 37
-rw-r--r-- pkg/sentry/platform/ptrace/subprocess_linux.go | 4
-rw-r--r-- pkg/sentry/socket/control/control.go | 12
-rw-r--r-- pkg/sentry/socket/epsocket/BUILD | 1
-rw-r--r-- pkg/sentry/socket/epsocket/epsocket.go | 96
-rw-r--r-- pkg/sentry/socket/epsocket/provider.go | 9
-rw-r--r-- pkg/sentry/socket/hostinet/BUILD | 1
-rw-r--r-- pkg/sentry/socket/hostinet/socket.go | 61
-rw-r--r-- pkg/sentry/socket/netlink/provider.go | 9
-rw-r--r-- pkg/sentry/socket/netlink/socket.go | 17
-rw-r--r-- pkg/sentry/socket/rpcinet/BUILD | 1
-rw-r--r-- pkg/sentry/socket/rpcinet/socket.go | 31
-rw-r--r-- pkg/sentry/socket/socket.go | 21
-rw-r--r-- pkg/sentry/socket/unix/transport/BUILD | 1
-rw-r--r-- pkg/sentry/socket/unix/transport/connectioned.go | 29
-rw-r--r-- pkg/sentry/socket/unix/transport/connectionless.go | 20
-rw-r--r-- pkg/sentry/socket/unix/transport/unix.go | 30
-rw-r--r-- pkg/sentry/socket/unix/unix.go | 80
-rw-r--r-- pkg/sentry/strace/socket.go | 14
-rw-r--r-- pkg/sentry/syscalls/linux/BUILD | 1
-rw-r--r-- pkg/sentry/syscalls/linux/error.go | 4
-rw-r--r-- pkg/sentry/syscalls/linux/linux64.go | 756
-rw-r--r-- pkg/sentry/syscalls/linux/sys_mempolicy.go | 312
-rw-r--r-- pkg/sentry/syscalls/linux/sys_mmap.go | 145
-rw-r--r-- pkg/sentry/syscalls/linux/sys_prctl.go | 33
-rw-r--r-- pkg/sentry/syscalls/linux/sys_socket.go | 4
-rw-r--r-- pkg/sentry/syscalls/syscalls.go | 88
-rw-r--r-- pkg/sentry/time/BUILD | 2
-rw-r--r-- pkg/sentry/usage/BUILD | 2
-rw-r--r-- pkg/sentry/usage/memory.go | 2
-rw-r--r-- pkg/sentry/usermem/usermem.go | 36
-rw-r--r-- pkg/sentry/usermem/usermem_test.go | 15
-rw-r--r-- pkg/tcpip/link/fdbased/endpoint.go | 168
-rw-r--r-- pkg/tcpip/link/fdbased/endpoint_test.go | 2
-rw-r--r-- pkg/tcpip/link/sniffer/sniffer.go | 10
-rw-r--r-- pkg/tcpip/network/ipv4/ipv4.go | 4
-rw-r--r-- pkg/tcpip/network/ipv6/ipv6.go | 4
-rw-r--r-- pkg/tcpip/sample/tun_tcp_connect/main.go | 2
-rw-r--r-- pkg/tcpip/sample/tun_tcp_echo/main.go | 2
-rw-r--r-- pkg/tcpip/stack/route.go | 10
-rw-r--r-- pkg/tcpip/stack/transport_test.go | 4
-rw-r--r-- pkg/tcpip/tcpip.go | 12
-rw-r--r-- pkg/tcpip/transport/icmp/BUILD | 1
-rw-r--r-- pkg/tcpip/transport/icmp/endpoint.go | 6
-rw-r--r-- pkg/tcpip/transport/raw/endpoint.go | 5
-rw-r--r-- pkg/tcpip/transport/tcp/BUILD | 2
-rw-r--r-- pkg/tcpip/transport/tcp/accept.go | 50
-rw-r--r-- pkg/tcpip/transport/tcp/connect.go | 52
-rw-r--r-- pkg/tcpip/transport/tcp/cubic.go | 3
-rw-r--r-- pkg/tcpip/transport/tcp/cubic_state.go | 29
-rw-r--r-- pkg/tcpip/transport/tcp/endpoint.go | 223
-rw-r--r-- pkg/tcpip/transport/tcp/endpoint_state.go | 44
-rw-r--r-- pkg/tcpip/transport/tcp/protocol.go | 26
-rw-r--r-- pkg/tcpip/transport/tcp/rcv.go | 37
-rw-r--r-- pkg/tcpip/transport/tcp/segment_queue.go | 6
-rw-r--r-- pkg/tcpip/transport/tcp/snd.go | 16
-rw-r--r-- pkg/tcpip/transport/tcp/tcp_noracedetector_test.go | 519
-rw-r--r-- pkg/tcpip/transport/tcp/tcp_test.go | 963
-rw-r--r-- pkg/tcpip/transport/tcp/testing/context/context.go | 85
-rw-r--r-- pkg/tcpip/transport/udp/endpoint.go | 6
-rw-r--r-- pkg/urpc/urpc.go | 2
-rw-r--r-- runsc/BUILD | 10
-rw-r--r-- runsc/boot/BUILD | 4
-rw-r--r-- runsc/boot/config.go | 13
-rw-r--r-- runsc/boot/controller.go | 27
-rw-r--r-- runsc/boot/fds.go | 3
-rw-r--r-- runsc/boot/filter/config.go | 4
-rw-r--r-- runsc/boot/fs.go | 986
-rw-r--r-- runsc/boot/fs_test.go | 193
-rw-r--r-- runsc/boot/loader.go | 131
-rw-r--r-- runsc/boot/loader_test.go | 9
-rw-r--r-- runsc/boot/network.go | 41
-rw-r--r-- runsc/boot/pprof.go (renamed from pkg/sentry/memutil/memutil.go) | 8
-rw-r--r-- runsc/cmd/BUILD | 4
-rw-r--r-- runsc/cmd/boot.go | 22
-rw-r--r-- runsc/cmd/capability_test.go | 2
-rw-r--r-- runsc/cmd/cmd.go | 19
-rw-r--r-- runsc/cmd/create.go | 9
-rw-r--r-- runsc/cmd/do.go | 39
-rw-r--r-- runsc/cmd/error.go | 72
-rw-r--r-- runsc/cmd/exec.go | 62
-rw-r--r-- runsc/cmd/help.go | 126
-rw-r--r-- runsc/cmd/restore.go | 10
-rw-r--r-- runsc/cmd/run.go | 8
-rw-r--r-- runsc/cmd/start.go | 1
-rw-r--r-- runsc/cmd/syscalls.go | 347
-rw-r--r-- runsc/cmd/wait.go | 4
-rw-r--r-- runsc/container/console_test.go | 2
-rw-r--r-- runsc/container/container.go | 8
-rw-r--r-- runsc/container/container_test.go | 5
-rw-r--r-- runsc/container/multi_container_test.go | 307
-rw-r--r-- runsc/main.go | 145
-rw-r--r-- runsc/sandbox/network.go | 121
-rw-r--r-- runsc/sandbox/sandbox.go | 99
-rw-r--r-- runsc/specutils/BUILD | 5
-rw-r--r-- runsc/specutils/fs.go | 40
-rw-r--r-- runsc/specutils/namespace.go | 52
-rw-r--r-- runsc/test/integration/BUILD | 1
-rw-r--r-- runsc/test/integration/exec_test.go | 23
-rw-r--r-- runsc/test/integration/regression_test.go | 45
-rw-r--r-- runsc/test/testutil/BUILD | 1
-rw-r--r-- runsc/test/testutil/testutil.go | 51
-rw-r--r-- test/BUILD | 6
-rw-r--r-- test/syscalls/BUILD | 212
-rw-r--r-- test/syscalls/build_defs.bzl | 19
-rw-r--r-- test/syscalls/linux/BUILD | 58
-rw-r--r-- test/syscalls/linux/accept_bind.cc | 14
-rw-r--r-- test/syscalls/linux/mempolicy.cc | 37
-rw-r--r-- test/syscalls/linux/pipe.cc | 4
-rw-r--r-- test/syscalls/linux/prctl.cc | 34
-rw-r--r-- test/syscalls/linux/proc.cc | 57
-rw-r--r-- test/syscalls/linux/proc_net_unix.cc | 178
-rw-r--r-- test/syscalls/linux/sendfile_socket.cc | 170
-rw-r--r-- test/syscalls/linux/socket_abstract.cc | 9
-rw-r--r-- test/syscalls/linux/socket_filesystem.cc | 9
-rw-r--r-- test/syscalls/linux/socket_inet_loopback.cc | 61
-rw-r--r-- test/syscalls/linux/socket_ip_loopback_blocking.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ip_tcp_generic.cc | 104
-rw-r--r-- test/syscalls/linux/socket_ip_tcp_generic_loopback.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ip_tcp_udp_generic.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ip_udp_loopback.cc | 6
-rw-r--r-- test/syscalls/linux/socket_ip_udp_loopback_blocking.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc | 2
-rw-r--r-- test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc | 129
-rw-r--r-- test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix.cc | 1508
-rw-r--r-- test/syscalls/linux/socket_unix_abstract_nonblock.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_blocking_local.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_cmsg.cc | 1473
-rw-r--r-- test/syscalls/linux/socket_unix_cmsg.h (renamed from test/syscalls/linux/socket_unix_abstract.cc) | 21
-rw-r--r-- test/syscalls/linux/socket_unix_dgram_local.cc | 6
-rw-r--r-- test/syscalls/linux/socket_unix_dgram_non_blocking.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_filesystem.cc | 37
-rw-r--r-- test/syscalls/linux/socket_unix_filesystem_nonblock.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_non_stream_blocking_local.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_pair.cc | 5
-rw-r--r-- test/syscalls/linux/socket_unix_pair_nonblock.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_seqpacket_local.cc | 6
-rw-r--r-- test/syscalls/linux/socket_unix_stream_blocking_local.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_stream_local.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_stream_nonblock_local.cc | 2
-rw-r--r-- test/syscalls/linux/socket_unix_unbound_dgram.cc | 24
-rw-r--r-- test/syscalls/linux/tcp_socket.cc | 127
-rw-r--r-- test/syscalls/syscall_test_runner.go | 14
-rwxr-xr-x tools/go_branch.sh | 76
-rwxr-xr-x tools/run_build.sh | 46
-rwxr-xr-x tools/run_tests.sh | 288
216 files changed, 9012 insertions(+), 4720 deletions(-)
diff --git a/.bazelrc b/.bazelrc
index b76976995..f6b21086d 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -1,4 +1,4 @@
-# Copyright 2019 Google LLC
+# Copyright 2019 The gVisor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/BUILD b/BUILD
index 391791ca9..6d5e800ca 100644
--- a/BUILD
+++ b/BUILD
@@ -1,6 +1,7 @@
package(licenses = ["notice"]) # Apache 2.0
load("@io_bazel_rules_go//go:def.bzl", "go_path")
+load("@bazel_gazelle//:def.bzl", "gazelle")
# The sandbox filegroup is used for sandbox-internal dependencies.
package_group(
@@ -22,3 +23,9 @@ go_path(
"//runsc",
],
)
+
+# gazelle is a set of build tools.
+#
+# To update the WORKSPACE from go.mod, use:
+# bazel run //:gazelle -- update-repos -from_file=go.mod
+gazelle(name = "gazelle")
diff --git a/README.md b/README.md
index f0252025c..17a15ad43 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,8 @@
![gVisor](g3doc/logo.png)
+[![Status](https://storage.googleapis.com/gvisor-build-badges/build.svg)](https://storage.googleapis.com/gvisor-build-badges/build.html)
+[![gVisor chat](https://badges.gitter.im/gvisor/community.png)](https://gitter.im/gvisor/community)
+
## What is gVisor?
**gVisor** is a user-space kernel, written in Go, that implements a substantial
@@ -36,8 +39,6 @@ be found at [gvisor.dev][gvisor-dev].
## Installing from source
-[![Status](https://storage.googleapis.com/gvisor-build-badges/build.svg)](https://storage.googleapis.com/gvisor-build-badges/build.html)
-
gVisor currently requires x86\_64 Linux to build, though support for other
architectures may become available in the future.
@@ -83,7 +84,7 @@ sudo cp ./bazel-bin/runsc/linux_amd64_pure_stripped/runsc /usr/local/bin
The test suite can be run with Bazel:
```
-bazel test ...
+bazel test //...
```
or in a Docker container:
diff --git a/WORKSPACE b/WORKSPACE
index 5da06317f..5155dc527 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -6,6 +6,7 @@ http_archive(
sha256 = "a82a352bffae6bee4e95f68a8d80a70e87f42c4741e6a448bec11998fcc82329",
url = "https://github.com/bazelbuild/rules_go/releases/download/0.18.5/rules_go-0.18.5.tar.gz",
)
+
http_archive(
name = "bazel_gazelle",
sha256 = "3c681998538231a2d24d0c07ed5a7658cb72bfb5fd4bf9911157c0e9ac6a2687",
@@ -37,86 +38,116 @@ http_archive(
# External repositories, in sorted order.
go_repository(
name = "com_github_cenkalti_backoff",
- commit = "66e726b43552c0bab0539b28e640b89fd6862115",
+ commit = "2146c9339422",
importpath = "github.com/cenkalti/backoff",
)
go_repository(
name = "com_github_gofrs_flock",
- commit = "886344bea0798d02ff3fae16a922be5f6b26cee0",
+ commit = "886344bea079",
importpath = "github.com/gofrs/flock",
)
go_repository(
name = "com_github_golang_mock",
- commit = "600781dde9cca80734169b9e969d9054ccc57937",
importpath = "github.com/golang/mock",
+ tag = "v1.3.1",
)
go_repository(
name = "com_github_google_go-cmp",
- commit = "3af367b6b30c263d47e8895973edcca9a49cf029",
importpath = "github.com/google/go-cmp",
+ tag = "v0.2.0",
)
go_repository(
name = "com_github_google_subcommands",
- commit = "ce3d4cfc062faac7115d44e5befec8b5a08c3faa",
+ commit = "636abe8753b8",
importpath = "github.com/google/subcommands",
)
go_repository(
name = "com_github_google_uuid",
- commit = "dec09d789f3dba190787f8b4454c7d3c936fed9e",
+ commit = "dec09d789f3d",
importpath = "github.com/google/uuid",
)
go_repository(
name = "com_github_kr_pty",
- commit = "282ce0e5322c82529687d609ee670fac7c7d917c",
importpath = "github.com/kr/pty",
+ tag = "v1.1.1",
)
go_repository(
name = "com_github_opencontainers_runtime-spec",
- commit = "b2d941ef6a780da2d9982c1fb28d77ad97f54fc7",
+ commit = "b2d941ef6a78",
importpath = "github.com/opencontainers/runtime-spec",
)
go_repository(
name = "com_github_syndtr_gocapability",
- commit = "d98352740cb2c55f81556b63d4a1ec64c5a319c2",
+ commit = "d98352740cb2",
importpath = "github.com/syndtr/gocapability",
)
go_repository(
name = "com_github_vishvananda_netlink",
- commit = "adb577d4a45e341da53c4d9196ad4222c9a23e69",
+ commit = "adb577d4a45e",
importpath = "github.com/vishvananda/netlink",
)
go_repository(
name = "com_github_vishvananda_netns",
- commit = "be1fbeda19366dea804f00efff2dd73a1642fdcc",
+ commit = "be1fbeda1936",
importpath = "github.com/vishvananda/netns",
)
go_repository(
+ name = "org_golang_x_crypto",
+ commit = "c2843e01d9a2",
+ importpath = "golang.org/x/crypto",
+)
+
+go_repository(
name = "org_golang_x_net",
- commit = "b3c676e531a6dc479fa1b35ac961c13f5e2b4d2e",
+ commit = "d8887717615a",
importpath = "golang.org/x/net",
)
go_repository(
+ name = "org_golang_x_text",
+ importpath = "golang.org/x/text",
+ tag = "v0.3.0",
+)
+
+go_repository(
+ name = "org_golang_x_tools",
+ commit = "36563e24a262",
+ importpath = "golang.org/x/tools",
+)
+
+go_repository(
+ name = "org_golang_x_sync",
+ commit = "112230192c58",
+ importpath = "golang.org/x/sync",
+)
+
+go_repository(
name = "org_golang_x_sys",
- commit = "0dd5e194bbf5eb84a39666eb4c98a4d007e4203a",
+ commit = "d0b11bdaac8a",
importpath = "golang.org/x/sys",
)
go_repository(
name = "com_github_google_btree",
- commit = "4030bb1f1f0c35b30ca7009e9ebd06849dd45306",
importpath = "github.com/google/btree",
+ tag = "v1.0.0",
+)
+
+go_repository(
+ name = "com_github_golang_protobuf",
+ importpath = "github.com/golang/protobuf",
+ tag = "v1.3.1",
)
# System Call test dependencies.
@@ -142,10 +173,10 @@ http_archive(
http_archive(
name = "com_google_googletest",
- sha256 = "574e884a41f0a9b76f849a5cdd89c393651e7537e5daa725cf12511232cbd74b",
- strip_prefix = "googletest-61cdca569b1f7e4629f8b949f0a9606c28281a6b",
+ sha256 = "db657310d3c5ca2d3f674e3a4b79718d1d39da70604568ee0568ba8e39065ef4",
+ strip_prefix = "googletest-31200def0dec8a624c861f919e86e4444e6e6ee7",
urls = [
- "https://mirror.bazel.build/github.com/google/googletest/archive/61cdca569b1f7e4629f8b949f0a9606c28281a6b.tar.gz",
- "https://github.com/google/googletest/archive/61cdca569b1f7e4629f8b949f0a9606c28281a6b.tar.gz",
+ "https://mirror.bazel.build/github.com/google/googletest/archive/31200def0dec8a624c861f919e86e4444e6e6ee7.tar.gz",
+ "https://github.com/google/googletest/archive/31200def0dec8a624c861f919e86e4444e6e6ee7.tar.gz",
],
)
diff --git a/cloudbuild/go.Dockerfile b/cloudbuild/go.Dockerfile
new file mode 100644
index 000000000..226442fd2
--- /dev/null
+++ b/cloudbuild/go.Dockerfile
@@ -0,0 +1,2 @@
+FROM ubuntu
+RUN apt-get -q update && apt-get install -qqy git rsync
diff --git a/cloudbuild/go.yaml b/cloudbuild/go.yaml
new file mode 100644
index 000000000..a38ef71fc
--- /dev/null
+++ b/cloudbuild/go.yaml
@@ -0,0 +1,22 @@
+steps:
+- name: 'gcr.io/cloud-builders/git'
+ args: ['fetch', '--all', '--unshallow']
+- name: 'gcr.io/cloud-builders/bazel'
+ args: ['build', ':gopath']
+- name: 'gcr.io/cloud-builders/docker'
+ args: ['build', '-t', 'gcr.io/$PROJECT_ID/go-branch', '-f', 'cloudbuild/go.Dockerfile', '.']
+- name: 'gcr.io/$PROJECT_ID/go-branch'
+ args: ['tools/go_branch.sh']
+- name: 'gcr.io/cloud-builders/git'
+ args: ['checkout', 'go']
+- name: 'gcr.io/cloud-builders/git'
+ args: ['clean', '-f']
+- name: 'golang'
+ args: ['go', 'build', './...']
+- name: 'gcr.io/cloud-builders/git'
+ entrypoint: 'bash'
+ args:
+ - '-c'
+ - 'if [[ "$BRANCH_NAME" == "master" ]]; then git push "${_ORIGIN}" go:go; fi'
+substitutions:
+ _ORIGIN: origin
diff --git a/go.mod b/go.mod
new file mode 100644
index 000000000..e58b84cfb
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,21 @@
+module gvisor.googlesource.com/gvisor
+
+go 1.12
+
+require (
+ github.com/cenkalti/backoff v0.0.0-20190506075156-2146c9339422
+ github.com/gofrs/flock v0.6.1-0.20180915234121-886344bea079
+ github.com/golang/mock v1.3.1
+ github.com/golang/protobuf v1.3.1
+ github.com/google/btree v1.0.0
+ github.com/google/go-cmp v0.2.0
+ github.com/google/subcommands v0.0.0-20190508160503-636abe8753b8
+ github.com/google/uuid v0.0.0-20171129191014-dec09d789f3d
+ github.com/kr/pty v1.1.1
+ github.com/opencontainers/runtime-spec v0.1.2-0.20171211145439-b2d941ef6a78
+ github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
+ github.com/vishvananda/netlink v1.0.1-0.20190318003149-adb577d4a45e
+ github.com/vishvananda/netns v0.0.0-20171111001504-be1fbeda1936
+ golang.org/x/net v0.0.0-20190311183353-d8887717615a
+ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a
+)
diff --git a/kokoro/common.cfg b/kokoro/common.cfg
index f6776ae84..cad873fe1 100644
--- a/kokoro/common.cfg
+++ b/kokoro/common.cfg
@@ -11,7 +11,7 @@ before_action {
# Configure bazel to access RBE.
bazel_setting {
# Our GCP project name
- project_id: "copybara-shentu"
+ project_id: "gvisor-rbe"
# Use RBE for execution as well as caching.
local_execution: false
diff --git a/kokoro/run_build.sh b/kokoro/run_build.sh
index 63fffda48..9deafe9bb 100755..120000
--- a/kokoro/run_build.sh
+++ b/kokoro/run_build.sh
@@ -1,42 +1 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Fail on any error.
-set -e
-# Display commands to stderr.
-set -x
-
-# Install the latest version of Bazel.
-use_bazel.sh latest
-
-# Log the bazel path and version.
-which bazel
-bazel version
-
-cd git/repo
-
-# Build runsc.
-bazel build //runsc
-
-# Move the runsc binary into "latest" directory, and also a directory with the
-# current date.
-latest_dir="${KOKORO_ARTIFACTS_DIR}"/latest
-today_dir="${KOKORO_ARTIFACTS_DIR}"/"$(date -Idate)"
-mkdir -p "${latest_dir}" "${today_dir}"
-cp bazel-bin/runsc/linux_amd64_pure_stripped/runsc "${latest_dir}"
-sha512sum "${latest_dir}"/runsc | awk '{print $1 " runsc"}' > "${latest_dir}"/runsc.sha512
-cp bazel-bin/runsc/linux_amd64_pure_stripped/runsc "${today_dir}"
-sha512sum "${today_dir}"/runsc | awk '{print $1 " runsc"}' > "${today_dir}"/runsc.sha512
+../tools/run_build.sh
\ No newline at end of file
diff --git a/kokoro/run_tests.sh b/kokoro/run_tests.sh
index 6a7c1fdb6..931cd2622 100755..120000
--- a/kokoro/run_tests.sh
+++ b/kokoro/run_tests.sh
@@ -1,258 +1 @@
-#!/bin/bash
-
-# Copyright 2018 The gVisor Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Fail on any error. Treat unset variables as error. Print commands as executed.
-set -eux
-
-
-###################
-# GLOBAL ENV VARS #
-###################
-
-readonly WORKSPACE_DIR="${PWD}/git/repo"
-
-# Used to configure RBE.
-readonly CLOUD_PROJECT_ID="copybara-shentu"
-readonly RBE_PROJECT_ID="projects/${CLOUD_PROJECT_ID}/instances/default_instance"
-
-# Random runtime name to avoid collisions.
-readonly RUNTIME="runsc_test_$((RANDOM))"
-
-# Packages that will be built and tested.
-readonly BUILD_PACKAGES=("//...")
-readonly TEST_PACKAGES=("//pkg/..." "//runsc/..." "//tools/...")
-
-#######################
-# BAZEL CONFIGURATION #
-#######################
-
-# Install the latest version of Bazel, and log the location and version.
-use_bazel.sh latest
-which bazel
-bazel version
-
-# Load the kvm module
-sudo -n -E modprobe kvm
-
-# General Bazel build/test flags.
-BAZEL_BUILD_FLAGS=(
- "--show_timestamps"
- "--test_output=errors"
- "--keep_going"
- "--verbose_failures=true"
-)
-
-# Bazel build/test for RBE, a super-set of BAZEL_BUILD_FLAGS.
-BAZEL_BUILD_RBE_FLAGS=(
- "${BAZEL_BUILD_FLAGS[@]}"
- "--config=remote"
- "--project_id=${CLOUD_PROJECT_ID}"
- "--remote_instance_name=${RBE_PROJECT_ID}"
- "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
-)
-
-####################
-# Helper Functions #
-####################
-
-build_everything() {
- FLAVOR="${1}"
-
- cd ${WORKSPACE_DIR}
- bazel build \
- -c "${FLAVOR}" "${BAZEL_BUILD_RBE_FLAGS[@]}" \
- "${BUILD_PACKAGES[@]}"
-}
-
-# Run simple tests runs the tests that require no special setup or
-# configuration.
-run_simple_tests() {
- cd ${WORKSPACE_DIR}
- bazel test \
- "${BAZEL_BUILD_FLAGS[@]}" \
- "${TEST_PACKAGES[@]}"
-}
-
-install_runtime() {
- cd ${WORKSPACE_DIR}
- sudo -n ${WORKSPACE_DIR}/runsc/test/install.sh --runtime ${RUNTIME}
-}
-
-# Install dependencies for the crictl tests.
-install_crictl_test_deps() {
- # Install containerd.
- sudo -n -E apt-get update
- sudo -n -E apt-get install -y btrfs-tools libseccomp-dev
- # go get will exit with a status of 1 despite succeeding, so ignore errors.
- go get -d github.com/containerd/containerd || true
- cd ${GOPATH}/src/github.com/containerd/containerd
- git checkout v1.2.2
- make
- sudo -n -E make install
-
- # Install crictl.
- # go get will exit with a status of 1 despite succeeding, so ignore errors.
- go get -d github.com/kubernetes-sigs/cri-tools || true
- cd ${GOPATH}/src/github.com/kubernetes-sigs/cri-tools
- git checkout tags/v1.11.0
- make
- sudo -n -E make install
-
- # Install gvisor-containerd-shim.
- local latest=/tmp/gvisor-containerd-shim-latest
- local shim_path=/tmp/gvisor-containerd-shim
- wget --no-verbose https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/latest -O ${latest}
- wget --no-verbose https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/gvisor-containerd-shim-$(cat ${latest}) -O ${shim_path}
- chmod +x ${shim_path}
- sudo -n -E mv ${shim_path} /usr/local/bin
-
- # Configure containerd-shim.
- local shim_config_path=/etc/containerd
- local shim_config_tmp_path=/tmp/gvisor-containerd-shim.toml
- sudo -n -E mkdir -p ${shim_config_path}
- cat > ${shim_config_tmp_path} <<-EOF
- runc_shim = "/usr/local/bin/containerd-shim"
-
- [runsc_config]
- debug = "true"
- debug-log = "/tmp/runsc-logs/"
- strace = "true"
- file-access = "shared"
-EOF
- sudo mv ${shim_config_tmp_path} ${shim_config_path}
-
- # Configure CNI.
- sudo -n -E env PATH=${PATH} ${GOPATH}/src/github.com/containerd/containerd/script/setup/install-cni
-}
-
-# Run the tests that require docker.
-run_docker_tests() {
- cd ${WORKSPACE_DIR}
-
- # Run tests with a default runtime (runc).
- bazel test \
- "${BAZEL_BUILD_FLAGS[@]}" \
- --test_env=RUNSC_RUNTIME="" \
- --test_output=all \
- //runsc/test/image:image_test
-
- # These names are used to exclude tests not supported in certain
- # configuration, e.g. save/restore not supported with hostnet.
- declare -a variations=("" "-kvm" "-hostnet" "-overlay")
- for v in "${variations[@]}"; do
- # Run runsc tests with docker that are tagged manual.
- bazel test \
- "${BAZEL_BUILD_FLAGS[@]}" \
- --test_env=RUNSC_RUNTIME="${RUNTIME}${v}" \
- --test_output=all \
- //runsc/test/image:image_test \
- //runsc/test/integration:integration_test
- done
-}
-
-# Run the tests that require root.
-run_root_tests() {
- cd ${WORKSPACE_DIR}
- bazel build //runsc/test/root:root_test
- local root_test=$(find -L ./bazel-bin/ -executable -type f -name root_test | grep __main__)
- if [[ ! -f "${root_test}" ]]; then
- echo "root_test executable not found"
- exit 1
- fi
- sudo -n -E RUNSC_RUNTIME="${RUNTIME}" RUNSC_EXEC=/tmp/"${RUNTIME}"/runsc ${root_test}
-}
-
-# Run syscall unit tests.
-run_syscall_tests() {
- cd ${WORKSPACE_DIR}
- bazel test "${BAZEL_BUILD_RBE_FLAGS[@]}" \
- --test_tag_filters=runsc_ptrace //test/syscalls/...
-}
-
-run_runsc_do_tests() {
- local runsc=$(find bazel-bin/runsc -type f -executable -name "runsc" | head -n1)
-
- # run runsc do without root privileges.
- unshare -Ur ${runsc} --network=none --TESTONLY-unsafe-nonroot do true
- unshare -Ur ${runsc} --TESTONLY-unsafe-nonroot --network=host do --netns=false true
-
- # run runsc do with root privileges.
- sudo -n -E ${runsc} do true
-}
-
-# Find and rename all test xml and log files so that Sponge can pick them up.
-# XML files must be named sponge_log.xml, and log files must be named
-# sponge_log.log. We move all such files into KOKORO_ARTIFACTS_DIR, in a
-# subdirectory named with the test name.
-upload_test_artifacts() {
- cd ${WORKSPACE_DIR}
- find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" |
- tar --create --files-from - --transform 's/test\./sponge_log./' |
- tar --extract --directory ${KOKORO_ARTIFACTS_DIR}
- if [[ -d "/tmp/${RUNTIME}/logs" ]]; then
- tar --create --gzip "--file=${KOKORO_ARTIFACTS_DIR}/runsc-logs.tar.gz" -C /tmp/ ${RUNTIME}/logs
- fi
-}
-
-# Finish runs at exit, even in the event of an error, and uploads all test
-# artifacts.
-finish() {
- # Grab the last exit code, we will return it.
- local exit_code=${?}
- upload_test_artifacts
- exit ${exit_code}
-}
-
-# Run bazel in a docker container
-build_in_docker() {
- cd ${WORKSPACE_DIR}
- bazel clean
- bazel shutdown
- make
- make runsc
- make bazel-shutdown
-}
-
-########
-# MAIN #
-########
-
-main() {
- # Register finish to run at exit.
- trap finish EXIT
-
- # Build and run the simple tests.
- build_everything opt
- run_simple_tests
-
- # So far so good. Install more deps and run the integration tests.
- install_runtime
- install_crictl_test_deps
- run_docker_tests
- run_root_tests
-
- run_syscall_tests
- run_runsc_do_tests
-
- # Build other flavors too.
- build_everything dbg
-
- build_in_docker
- # No need to call "finish" here, it will happen at exit.
-}
-
-# Kick it off.
-main
+../tools/run_tests.sh
\ No newline at end of file
diff --git a/pkg/abi/linux/capability.go b/pkg/abi/linux/capability.go
index c120cac64..65dd77e6e 100644
--- a/pkg/abi/linux/capability.go
+++ b/pkg/abi/linux/capability.go
@@ -69,6 +69,90 @@ func (cp Capability) Ok() bool {
return cp >= 0 && cp <= MaxCapability
}
+// String returns the capability name.
+func (cp Capability) String() string {
+ switch cp {
+ case CAP_CHOWN:
+ return "CAP_CHOWN"
+ case CAP_DAC_OVERRIDE:
+ return "CAP_DAC_OVERRIDE"
+ case CAP_DAC_READ_SEARCH:
+ return "CAP_DAC_READ_SEARCH"
+ case CAP_FOWNER:
+ return "CAP_FOWNER"
+ case CAP_FSETID:
+ return "CAP_FSETID"
+ case CAP_KILL:
+ return "CAP_KILL"
+ case CAP_SETGID:
+ return "CAP_SETGID"
+ case CAP_SETUID:
+ return "CAP_SETUID"
+ case CAP_SETPCAP:
+ return "CAP_SETPCAP"
+ case CAP_LINUX_IMMUTABLE:
+ return "CAP_LINUX_IMMUTABLE"
+ case CAP_NET_BIND_SERVICE:
+ return "CAP_NET_BIND_SERVICE"
+ case CAP_NET_BROADCAST:
+ return "CAP_NET_BROADCAST"
+ case CAP_NET_ADMIN:
+ return "CAP_NET_ADMIN"
+ case CAP_NET_RAW:
+ return "CAP_NET_RAW"
+ case CAP_IPC_LOCK:
+ return "CAP_IPC_LOCK"
+ case CAP_IPC_OWNER:
+ return "CAP_IPC_OWNER"
+ case CAP_SYS_MODULE:
+ return "CAP_SYS_MODULE"
+ case CAP_SYS_RAWIO:
+ return "CAP_SYS_RAWIO"
+ case CAP_SYS_CHROOT:
+ return "CAP_SYS_CHROOT"
+ case CAP_SYS_PTRACE:
+ return "CAP_SYS_PTRACE"
+ case CAP_SYS_PACCT:
+ return "CAP_SYS_PACCT"
+ case CAP_SYS_ADMIN:
+ return "CAP_SYS_ADMIN"
+ case CAP_SYS_BOOT:
+ return "CAP_SYS_BOOT"
+ case CAP_SYS_NICE:
+ return "CAP_SYS_NICE"
+ case CAP_SYS_RESOURCE:
+ return "CAP_SYS_RESOURCE"
+ case CAP_SYS_TIME:
+ return "CAP_SYS_TIME"
+ case CAP_SYS_TTY_CONFIG:
+ return "CAP_SYS_TTY_CONFIG"
+ case CAP_MKNOD:
+ return "CAP_MKNOD"
+ case CAP_LEASE:
+ return "CAP_LEASE"
+ case CAP_AUDIT_WRITE:
+ return "CAP_AUDIT_WRITE"
+ case CAP_AUDIT_CONTROL:
+ return "CAP_AUDIT_CONTROL"
+ case CAP_SETFCAP:
+ return "CAP_SETFCAP"
+ case CAP_MAC_OVERRIDE:
+ return "CAP_MAC_OVERRIDE"
+ case CAP_MAC_ADMIN:
+ return "CAP_MAC_ADMIN"
+ case CAP_SYSLOG:
+ return "CAP_SYSLOG"
+ case CAP_WAKE_ALARM:
+ return "CAP_WAKE_ALARM"
+ case CAP_BLOCK_SUSPEND:
+ return "CAP_BLOCK_SUSPEND"
+ case CAP_AUDIT_READ:
+ return "CAP_AUDIT_READ"
+ default:
+ return "UNKNOWN"
+ }
+}
+
// Version numbers used by the capget/capset syscalls, defined in Linux's
// include/uapi/linux/capability.h.
const (
diff --git a/pkg/abi/linux/mm.go b/pkg/abi/linux/mm.go
index 0b02f938a..cd043dac3 100644
--- a/pkg/abi/linux/mm.go
+++ b/pkg/abi/linux/mm.go
@@ -114,3 +114,12 @@ const (
MPOL_MODE_FLAGS = (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
)
+
+// Flags for mbind(2).
+const (
+ MPOL_MF_STRICT = 1 << 0
+ MPOL_MF_MOVE = 1 << 1
+ MPOL_MF_MOVE_ALL = 1 << 2
+
+ MPOL_MF_VALID = MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL
+)
diff --git a/pkg/abi/linux/prctl.go b/pkg/abi/linux/prctl.go
index 0428282dd..391cfaa1c 100644
--- a/pkg/abi/linux/prctl.go
+++ b/pkg/abi/linux/prctl.go
@@ -155,3 +155,10 @@ const (
ARCH_GET_GS = 0x1004
ARCH_SET_CPUID = 0x1012
)
+
+// Flags for prctl(PR_SET_DUMPABLE), defined in include/linux/sched/coredump.h.
+const (
+ SUID_DUMP_DISABLE = 0
+ SUID_DUMP_USER = 1
+ SUID_DUMP_ROOT = 2
+)
diff --git a/pkg/abi/linux/socket.go b/pkg/abi/linux/socket.go
index 417840731..a714ac86d 100644
--- a/pkg/abi/linux/socket.go
+++ b/pkg/abi/linux/socket.go
@@ -102,15 +102,19 @@ const (
SOL_NETLINK = 270
)
+// A SockType is a type (as opposed to family) of sockets. These are enumerated
+// below as SOCK_* constants.
+type SockType int
+
// Socket types, from linux/net.h.
const (
- SOCK_STREAM = 1
- SOCK_DGRAM = 2
- SOCK_RAW = 3
- SOCK_RDM = 4
- SOCK_SEQPACKET = 5
- SOCK_DCCP = 6
- SOCK_PACKET = 10
+ SOCK_STREAM SockType = 1
+ SOCK_DGRAM = 2
+ SOCK_RAW = 3
+ SOCK_RDM = 4
+ SOCK_SEQPACKET = 5
+ SOCK_DCCP = 6
+ SOCK_PACKET = 10
)
// SOCK_TYPE_MASK covers all of the above socket types. The remaining bits are
@@ -200,6 +204,22 @@ const (
SS_DISCONNECTING = 4 // In process of disconnecting.
)
+// TCP protocol states, from include/net/tcp_states.h.
+const (
+ TCP_ESTABLISHED uint32 = iota + 1
+ TCP_SYN_SENT
+ TCP_SYN_RECV
+ TCP_FIN_WAIT1
+ TCP_FIN_WAIT2
+ TCP_TIME_WAIT
+ TCP_CLOSE
+ TCP_CLOSE_WAIT
+ TCP_LAST_ACK
+ TCP_LISTEN
+ TCP_CLOSING
+ TCP_NEW_SYN_RECV
+)
+
// SockAddrMax is the maximum size of a struct sockaddr, from
// uapi/linux/socket.h.
const SockAddrMax = 128
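The new SockType is carried in the same socket(2) argument as creation flags such as SOCK_NONBLOCK and SOCK_CLOEXEC; SOCK_TYPE_MASK (in the context just above) covers the type bits so the two can be separated. Below is a minimal standalone Go sketch of that decomposition; the constant values are copied from linux/net.h and fcntl.h for illustration, and splitSockArg is a hypothetical helper, not gVisor code:

```go
package main

import "fmt"

// SockType mirrors the typed socket-type constants introduced above.
type SockType int

const (
	SOCK_STREAM    SockType = 1
	SOCK_DGRAM     SockType = 2
	SOCK_SEQPACKET SockType = 5

	// SOCK_TYPE_MASK covers all socket types; the remaining bits are flags.
	SOCK_TYPE_MASK = 0xf

	SOCK_NONBLOCK = 0x800   // O_NONBLOCK
	SOCK_CLOEXEC  = 0x80000 // O_CLOEXEC
)

// splitSockArg separates the socket type from the creation flags packed into
// the type argument of socket(2).
func splitSockArg(arg int) (SockType, int) {
	return SockType(arg & SOCK_TYPE_MASK), arg &^ SOCK_TYPE_MASK
}

func main() {
	st, flags := splitSockArg(int(SOCK_STREAM) | SOCK_NONBLOCK | SOCK_CLOEXEC)
	fmt.Printf("type=%d flags=%#x\n", st, flags) // type=1 flags=0x80800
}
```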
diff --git a/pkg/memutil/BUILD b/pkg/memutil/BUILD
new file mode 100644
index 000000000..71b48a972
--- /dev/null
+++ b/pkg/memutil/BUILD
@@ -0,0 +1,11 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "memutil",
+ srcs = ["memutil_unsafe.go"],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/memutil",
+ visibility = ["//visibility:public"],
+ deps = ["@org_golang_x_sys//unix:go_default_library"],
+)
diff --git a/pkg/sentry/memutil/memutil_unsafe.go b/pkg/memutil/memutil_unsafe.go
index 92eab8a26..979d942a9 100644
--- a/pkg/sentry/memutil/memutil_unsafe.go
+++ b/pkg/memutil/memutil_unsafe.go
@@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+// +build linux
+
+// Package memutil provides a wrapper for the memfd_create() system call.
package memutil
import (
diff --git a/pkg/sentry/platform/procid/BUILD b/pkg/procid/BUILD
index 277509624..7c22b763a 100644
--- a/pkg/sentry/platform/procid/BUILD
+++ b/pkg/procid/BUILD
@@ -9,8 +9,8 @@ go_library(
"procid_amd64.s",
"procid_arm64.s",
],
- importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid",
- visibility = ["//pkg/sentry:internal"],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/procid",
+ visibility = ["//visibility:public"],
)
go_test(
diff --git a/pkg/sentry/platform/procid/procid.go b/pkg/procid/procid.go
index 78b92422c..78b92422c 100644
--- a/pkg/sentry/platform/procid/procid.go
+++ b/pkg/procid/procid.go
diff --git a/pkg/sentry/platform/procid/procid_amd64.s b/pkg/procid/procid_amd64.s
index 30ec8e6e2..30ec8e6e2 100644
--- a/pkg/sentry/platform/procid/procid_amd64.s
+++ b/pkg/procid/procid_amd64.s
diff --git a/pkg/sentry/platform/procid/procid_arm64.s b/pkg/procid/procid_arm64.s
index e340d9f98..e340d9f98 100644
--- a/pkg/sentry/platform/procid/procid_arm64.s
+++ b/pkg/procid/procid_arm64.s
diff --git a/pkg/sentry/platform/procid/procid_net_test.go b/pkg/procid/procid_net_test.go
index b628e2285..b628e2285 100644
--- a/pkg/sentry/platform/procid/procid_net_test.go
+++ b/pkg/procid/procid_net_test.go
diff --git a/pkg/sentry/platform/procid/procid_test.go b/pkg/procid/procid_test.go
index 88dd0b3ae..88dd0b3ae 100644
--- a/pkg/sentry/platform/procid/procid_test.go
+++ b/pkg/procid/procid_test.go
diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD
index ce4f1e42c..d17b1bdcf 100644
--- a/pkg/sentry/context/contexttest/BUILD
+++ b/pkg/sentry/context/contexttest/BUILD
@@ -9,11 +9,11 @@ go_library(
importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest",
visibility = ["//pkg/sentry:internal"],
deps = [
+ "//pkg/memutil",
"//pkg/sentry/context",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/time",
"//pkg/sentry/limits",
- "//pkg/sentry/memutil",
"//pkg/sentry/pgalloc",
"//pkg/sentry/platform",
"//pkg/sentry/platform/ptrace",
diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go
index 210a235d2..83da40711 100644
--- a/pkg/sentry/context/contexttest/contexttest.go
+++ b/pkg/sentry/context/contexttest/contexttest.go
@@ -21,11 +21,11 @@ import (
"testing"
"time"
+ "gvisor.googlesource.com/gvisor/pkg/memutil"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
- "gvisor.googlesource.com/gvisor/pkg/sentry/memutil"
"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go
index c0bc261a2..a0a35c242 100644
--- a/pkg/sentry/fs/dirent.go
+++ b/pkg/sentry/fs/dirent.go
@@ -805,7 +805,7 @@ func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data trans
var childDir *Dirent
err := d.genericCreate(ctx, root, name, func() error {
var e error
- childDir, e = d.Inode.Bind(ctx, name, data, perms)
+ childDir, e = d.Inode.Bind(ctx, d, name, data, perms)
if e != nil {
return e
}
diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go
index 8c1307235..f64954457 100644
--- a/pkg/sentry/fs/file.go
+++ b/pkg/sentry/fs/file.go
@@ -545,12 +545,28 @@ type lockedWriter struct {
// Write implements io.Writer.Write.
func (w *lockedWriter) Write(buf []byte) (int, error) {
- n, err := w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf), w.File.offset)
- return int(n), err
+ return w.WriteAt(buf, w.File.offset)
}
// WriteAt implements io.Writer.WriteAt.
func (w *lockedWriter) WriteAt(buf []byte, offset int64) (int, error) {
- n, err := w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf), offset)
- return int(n), err
+ var (
+ written int
+ err error
+ )
+ // The io.Writer contract requires that Write writes all available
+ // bytes and does not return short writes. This causes errors with
+ // io.Copy, since our own Write interface does not have this same
+ // contract. Enforce that here.
+ for written < len(buf) {
+ var n int64
+ n, err = w.File.FileOperations.Write(w.Ctx, w.File, usermem.BytesIOSequence(buf[written:]), offset+int64(written))
+ if n > 0 {
+ written += int(n)
+ }
+ if err != nil {
+ break
+ }
+ }
+ return written, err
}
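The loop added to lockedWriter.WriteAt exists because the io.Writer contract requires Write to return a non-nil error whenever it writes fewer than len(p) bytes, and io.Copy depends on that guarantee. Below is a self-contained sketch of the same full-write pattern using only the standard library; writeAll is a hypothetical helper for illustration, not gVisor code:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
)

// writeAll keeps calling w.Write until the whole buffer is consumed or an
// error occurs, so callers such as io.Copy never observe a silent short write.
func writeAll(w io.Writer, buf []byte) (int, error) {
	written := 0
	for written < len(buf) {
		n, err := w.Write(buf[written:])
		if n > 0 {
			written += n
		}
		if err != nil {
			return written, err
		}
		if n == 0 {
			// Defensive: a writer that makes no progress and reports
			// no error would otherwise spin forever.
			return written, io.ErrShortWrite
		}
	}
	return written, nil
}

func main() {
	var dst bytes.Buffer
	n, err := writeAll(&dst, []byte("hello"))
	fmt.Println(n, err, dst.String()) // 5 <nil> hello
}
```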
diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go
index cbd5b9a84..7ac0a421f 100644
--- a/pkg/sentry/fs/gofer/socket.go
+++ b/pkg/sentry/fs/gofer/socket.go
@@ -15,6 +15,7 @@
package gofer
import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/pkg/p9"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
@@ -61,13 +62,13 @@ type endpoint struct {
path string
}
-func unixSockToP9(t transport.SockType) (p9.ConnectFlags, bool) {
+func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) {
switch t {
- case transport.SockStream:
+ case linux.SOCK_STREAM:
return p9.StreamSocket, true
- case transport.SockSeqpacket:
+ case linux.SOCK_SEQPACKET:
return p9.SeqpacketSocket, true
- case transport.SockDgram:
+ case linux.SOCK_DGRAM:
return p9.DgramSocket, true
}
return 0, false
@@ -75,7 +76,7 @@ func unixSockToP9(t transport.SockType) (p9.ConnectFlags, bool) {
// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect.
func (e *endpoint) BidirectionalConnect(ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error {
- cf, ok := unixSockToP9(ce.Type())
+ cf, ok := sockTypeToP9(ce.Type())
if !ok {
return syserr.ErrConnectionRefused
}
@@ -139,3 +140,8 @@ func (e *endpoint) UnidirectionalConnect() (transport.ConnectedEndpoint, *syserr
func (e *endpoint) Release() {
e.inode.DecRef()
}
+
+// Passcred implements transport.BoundEndpoint.Passcred.
+func (e *endpoint) Passcred() bool {
+ return false
+}
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index 3ed137006..305eea718 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -15,9 +15,11 @@
package host
import (
+ "fmt"
"sync"
"syscall"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/fd"
"gvisor.googlesource.com/gvisor/pkg/fdnotifier"
"gvisor.googlesource.com/gvisor/pkg/log"
@@ -51,25 +53,11 @@ type ConnectedEndpoint struct {
// ref keeps track of references to a connectedEndpoint.
ref refs.AtomicRefCount
- // mu protects fd, readClosed and writeClosed.
- mu sync.RWMutex `state:"nosave"`
-
- // file is an *fd.FD containing the FD backing this endpoint. It must be
- // set to nil if it has been closed.
- file *fd.FD `state:"nosave"`
-
- // readClosed is true if the FD has read shutdown or if it has been closed.
- readClosed bool
-
- // writeClosed is true if the FD has write shutdown or if it has been
- // closed.
- writeClosed bool
-
// If srfd >= 0, it is the host FD that file was imported from.
srfd int `state:"wait"`
// stype is the type of Unix socket.
- stype transport.SockType
+ stype linux.SockType
// sndbuf is the size of the send buffer.
//
@@ -78,6 +66,13 @@ type ConnectedEndpoint struct {
// prevent lots of small messages from filling the real send buffer
// size on the host.
sndbuf int `state:"nosave"`
+
+ // mu protects the fields below.
+ mu sync.RWMutex `state:"nosave"`
+
+ // file is an *fd.FD containing the FD backing this endpoint. It must be
+ // set to nil if it has been closed.
+ file *fd.FD `state:"nosave"`
}
// init performs initialization required for creating new ConnectedEndpoints and
@@ -111,7 +106,7 @@ func (c *ConnectedEndpoint) init() *syserr.Error {
return syserr.ErrInvalidEndpointState
}
- c.stype = transport.SockType(stype)
+ c.stype = linux.SockType(stype)
c.sndbuf = sndbuf
return nil
@@ -169,7 +164,7 @@ func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.F
ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e)
- return unixsocket.NewWithDirent(ctx, d, ep, e.stype != transport.SockStream, flags), nil
+ return unixsocket.NewWithDirent(ctx, d, ep, e.stype, flags), nil
}
// newSocket allocates a new unix socket with host endpoint.
@@ -201,16 +196,13 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error)
ep := transport.NewExternal(e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e)
- return unixsocket.New(ctx, ep, e.stype != transport.SockStream), nil
+ return unixsocket.New(ctx, ep, e.stype), nil
}
// Send implements transport.ConnectedEndpoint.Send.
func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
- if c.writeClosed {
- return 0, false, syserr.ErrClosedForSend
- }
if !controlMessages.Empty() {
return 0, false, syserr.ErrInvalidEndpointState
@@ -218,7 +210,7 @@ func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.Contro
// Since stream sockets don't preserve message boundaries, we can write
// only as much of the message as fits in the send buffer.
- truncate := c.stype == transport.SockStream
+ truncate := c.stype == linux.SOCK_STREAM
n, totalLen, err := fdWriteVec(c.file.FD(), data, c.sndbuf, truncate)
if n < totalLen && err == nil {
@@ -244,8 +236,13 @@ func (c *ConnectedEndpoint) SendNotify() {}
// CloseSend implements transport.ConnectedEndpoint.CloseSend.
func (c *ConnectedEndpoint) CloseSend() {
c.mu.Lock()
- c.writeClosed = true
- c.mu.Unlock()
+ defer c.mu.Unlock()
+
+ if err := syscall.Shutdown(c.file.FD(), syscall.SHUT_WR); err != nil {
+ // A well-formed UDS shutdown can't fail. See
+ // net/unix/af_unix.c:unix_shutdown.
+ panic(fmt.Sprintf("failed write shutdown on host socket %+v: %v", c, err))
+ }
}
// CloseNotify implements transport.ConnectedEndpoint.CloseNotify.
@@ -255,9 +252,7 @@ func (c *ConnectedEndpoint) CloseNotify() {}
func (c *ConnectedEndpoint) Writable() bool {
c.mu.RLock()
defer c.mu.RUnlock()
- if c.writeClosed {
- return true
- }
+
return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventOut)&waiter.EventOut != 0
}
@@ -285,9 +280,6 @@ func (c *ConnectedEndpoint) EventUpdate() {
func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
- if c.readClosed {
- return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.ErrClosedForReceive
- }
var cm unet.ControlMessage
if numRights > 0 {
@@ -344,31 +336,34 @@ func (c *ConnectedEndpoint) RecvNotify() {}
// CloseRecv implements transport.Receiver.CloseRecv.
func (c *ConnectedEndpoint) CloseRecv() {
c.mu.Lock()
- c.readClosed = true
- c.mu.Unlock()
+ defer c.mu.Unlock()
+
+ if err := syscall.Shutdown(c.file.FD(), syscall.SHUT_RD); err != nil {
+ // A well-formed UDS shutdown can't fail. See
+ // net/unix/af_unix.c:unix_shutdown.
+ panic(fmt.Sprintf("failed read shutdown on host socket %+v: %v", c, err))
+ }
}
// Readable implements transport.Receiver.Readable.
func (c *ConnectedEndpoint) Readable() bool {
c.mu.RLock()
defer c.mu.RUnlock()
- if c.readClosed {
- return true
- }
+
return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventIn)&waiter.EventIn != 0
}
// SendQueuedSize implements transport.Receiver.SendQueuedSize.
func (c *ConnectedEndpoint) SendQueuedSize() int64 {
- // SendQueuedSize isn't supported for host sockets because we don't allow the
- // sentry to call ioctl(2).
+ // TODO(gvisor.dev/issue/273): SendQueuedSize isn't supported for host
+ // sockets because we don't allow the sentry to call ioctl(2).
return -1
}
// RecvQueuedSize implements transport.Receiver.RecvQueuedSize.
func (c *ConnectedEndpoint) RecvQueuedSize() int64 {
- // RecvQueuedSize isn't supported for host sockets because we don't allow the
- // sentry to call ioctl(2).
+ // TODO(gvisor.dev/issue/273): RecvQueuedSize isn't supported for host
+ // sockets because we don't allow the sentry to call ioctl(2).
return -1
}
diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go
index 06392a65a..bc3ce5627 100644
--- a/pkg/sentry/fs/host/socket_test.go
+++ b/pkg/sentry/fs/host/socket_test.go
@@ -198,20 +198,6 @@ func TestListen(t *testing.T) {
}
}
-func TestSend(t *testing.T) {
- e := ConnectedEndpoint{writeClosed: true}
- if _, _, err := e.Send(nil, transport.ControlMessages{}, tcpip.FullAddress{}); err != syserr.ErrClosedForSend {
- t.Errorf("Got %#v.Send() = %v, want = %v", e, err, syserr.ErrClosedForSend)
- }
-}
-
-func TestRecv(t *testing.T) {
- e := ConnectedEndpoint{readClosed: true}
- if _, _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != syserr.ErrClosedForReceive {
- t.Errorf("Got %#v.Recv() = %v, want = %v", e, err, syserr.ErrClosedForReceive)
- }
-}
-
func TestPasscred(t *testing.T) {
e := ConnectedEndpoint{}
if got, want := e.Passcred(), false; got != want {
@@ -244,20 +230,6 @@ func TestQueuedSize(t *testing.T) {
}
}
-func TestReadable(t *testing.T) {
- e := ConnectedEndpoint{readClosed: true}
- if got, want := e.Readable(), true; got != want {
- t.Errorf("Got %#v.Readable() = %t, want = %t", e, got, want)
- }
-}
-
-func TestWritable(t *testing.T) {
- e := ConnectedEndpoint{writeClosed: true}
- if got, want := e.Writable(), true; got != want {
- t.Errorf("Got %#v.Writable() = %t, want = %t", e, got, want)
- }
-}
-
func TestRelease(t *testing.T) {
f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
if err != nil {
@@ -272,131 +244,3 @@ func TestRelease(t *testing.T) {
t.Errorf("got = %#v, want = %#v", c, want)
}
}
-
-func TestClose(t *testing.T) {
- type testCase struct {
- name string
- cep *ConnectedEndpoint
- addFD bool
- f func()
- want *ConnectedEndpoint
- }
-
- var tests []testCase
-
- // nil is the value used by ConnectedEndpoint to indicate a closed file.
- // Non-nil files are used to check if the file gets closed.
-
- f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c := &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)}
- tests = append(tests, testCase{
- name: "First CloseRecv",
- cep: c,
- addFD: false,
- f: c.CloseRecv,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true}
- tests = append(tests, testCase{
- name: "Second CloseRecv",
- cep: c,
- addFD: false,
- f: c.CloseRecv,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)}
- tests = append(tests, testCase{
- name: "First CloseSend",
- cep: c,
- addFD: false,
- f: c.CloseSend,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, writeClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true}
- tests = append(tests, testCase{
- name: "Second CloseSend",
- cep: c,
- addFD: false,
- f: c.CloseSend,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, writeClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true}
- tests = append(tests, testCase{
- name: "CloseSend then CloseRecv",
- cep: c,
- addFD: true,
- f: c.CloseRecv,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true}
- tests = append(tests, testCase{
- name: "CloseRecv then CloseSend",
- cep: c,
- addFD: true,
- f: c.CloseSend,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true}
- tests = append(tests, testCase{
- name: "Full close then CloseRecv",
- cep: c,
- addFD: false,
- f: c.CloseRecv,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
- })
-
- f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
- if err != nil {
- t.Fatal("Creating socket:", err)
- }
- c = &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true}
- tests = append(tests, testCase{
- name: "Full close then CloseSend",
- cep: c,
- addFD: false,
- f: c.CloseSend,
- want: &ConnectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true},
- })
-
- for _, test := range tests {
- if test.addFD {
- fdnotifier.AddFD(int32(test.cep.file.FD()), nil)
- }
- if test.f(); !reflect.DeepEqual(test.cep, test.want) {
- t.Errorf("%s: got = %#v, want = %#v", test.name, test.cep, test.want)
- }
- }
-}
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go
index aef1a1cb9..0b54c2e77 100644
--- a/pkg/sentry/fs/inode.go
+++ b/pkg/sentry/fs/inode.go
@@ -220,9 +220,9 @@ func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent,
}
// Bind calls i.InodeOperations.Bind with i as the directory.
-func (i *Inode) Bind(ctx context.Context, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) {
+func (i *Inode) Bind(ctx context.Context, parent *Dirent, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) {
if i.overlay != nil {
- return overlayBind(ctx, i.overlay, name, data, perm)
+ return overlayBind(ctx, i.overlay, parent, name, data, perm)
}
return i.InodeOperations.Bind(ctx, i, name, data, perm)
}
diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go
index cdffe173b..06506fb20 100644
--- a/pkg/sentry/fs/inode_overlay.go
+++ b/pkg/sentry/fs/inode_overlay.go
@@ -398,14 +398,14 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena
return nil
}
-func overlayBind(ctx context.Context, o *overlayEntry, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) {
+func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name string, data transport.BoundEndpoint, perm FilePermissions) (*Dirent, error) {
+ if err := copyUp(ctx, parent); err != nil {
+ return nil, err
+ }
+
o.copyMu.RLock()
defer o.copyMu.RUnlock()
- // We do not support doing anything exciting with sockets unless there
- // is already a directory in the upper filesystem.
- if o.upper == nil {
- return nil, syserror.EOPNOTSUPP
- }
+
d, err := o.upper.InodeOperations.Bind(ctx, o.upper, name, data, perm)
if err != nil {
return nil, err
diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD
index d19c360e0..1728fe0b5 100644
--- a/pkg/sentry/fs/proc/BUILD
+++ b/pkg/sentry/fs/proc/BUILD
@@ -45,6 +45,7 @@ go_library(
"//pkg/sentry/kernel/time",
"//pkg/sentry/limits",
"//pkg/sentry/mm",
+ "//pkg/sentry/socket",
"//pkg/sentry/socket/rpcinet",
"//pkg/sentry/socket/unix",
"//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go
index 379569823..986bc0a45 100644
--- a/pkg/sentry/fs/proc/inode.go
+++ b/pkg/sentry/fs/proc/inode.go
@@ -21,11 +21,14 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)
// taskOwnedInodeOps wraps an fs.InodeOperations and overrides the UnstableAttr
-// method to return the task as the owner.
+// method to return either the task or root as the owner, depending on the
+// task's dumpability.
//
// +stateify savable
type taskOwnedInodeOps struct {
@@ -41,9 +44,42 @@ func (i *taskOwnedInodeOps) UnstableAttr(ctx context.Context, inode *fs.Inode) (
if err != nil {
return fs.UnstableAttr{}, err
}
- // Set the task owner as the file owner.
+
+ // By default, set the task owner as the file owner.
creds := i.t.Credentials()
uattr.Owner = fs.FileOwner{creds.EffectiveKUID, creds.EffectiveKGID}
+
+ // Linux doesn't apply dumpability adjustments to world
+ // readable/executable directories so that applications can stat
+ // /proc/PID to determine the effective UID of a process. See
+ // fs/proc/base.c:task_dump_owner.
+ if fs.IsDir(inode.StableAttr) && uattr.Perms == fs.FilePermsFromMode(0555) {
+ return uattr, nil
+ }
+
+ // If the task is not dumpable, then root (in the namespace preferred)
+ // owns the file.
+ var m *mm.MemoryManager
+ i.t.WithMuLocked(func(t *kernel.Task) {
+ m = t.MemoryManager()
+ })
+
+ if m == nil {
+ uattr.Owner.UID = auth.RootKUID
+ uattr.Owner.GID = auth.RootKGID
+ } else if m.Dumpability() != mm.UserDumpable {
+ if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() {
+ uattr.Owner.UID = kuid
+ } else {
+ uattr.Owner.UID = auth.RootKUID
+ }
+ if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() {
+ uattr.Owner.GID = kgid
+ } else {
+ uattr.Owner.GID = auth.RootKGID
+ }
+ }
+
return uattr, nil
}
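The ownership logic above mirrors Linux's fs/proc/base.c:task_dump_owner: the world-readable 0555 directories stay owned by the task so a plain stat of /proc/PID still reveals its effective UID, while other entries fall back to root when the task is not dumpable. Below is a small Linux-only sketch of the stat-based lookup that this behavior preserves (illustrative, not gVisor code):

```go
package main

import (
	"fmt"
	"os"
	"syscall"
)

func main() {
	// Stat /proc/<pid> and read the owner UID, which Linux keeps equal to
	// the task's effective UID for the 0555 proc directories.
	fi, err := os.Stat(fmt.Sprintf("/proc/%d", os.Getpid()))
	if err != nil {
		panic(err)
	}
	st := fi.Sys().(*syscall.Stat_t)
	fmt.Println("effective owner uid:", st.Uid)
}
```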
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 4a107c739..034950158 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -27,6 +27,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
"gvisor.googlesource.com/gvisor/pkg/sentry/inet"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/socket"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
)
@@ -213,17 +214,18 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
fmt.Fprintf(&buf, "Num RefCount Protocol Flags Type St Inode Path\n")
// Entries
- for _, sref := range n.k.ListSockets(linux.AF_UNIX) {
- s := sref.Get()
+ for _, se := range n.k.ListSockets() {
+ s := se.Sock.Get()
if s == nil {
- log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", sref)
+ log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
continue
}
sfile := s.(*fs.File)
- sops, ok := sfile.FileOperations.(*unix.SocketOperations)
- if !ok {
- panic(fmt.Sprintf("Found non-unix socket file in unix socket table: %+v", sfile))
+ if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
+ // Not a unix socket.
+ continue
}
+ sops := sfile.FileOperations.(*unix.SocketOperations)
addr, err := sops.Endpoint().GetLocalAddress()
if err != nil {
@@ -240,24 +242,6 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
}
}
- var sockState int
- switch sops.Endpoint().Type() {
- case linux.SOCK_DGRAM:
- sockState = linux.SS_CONNECTING
- // Unlike Linux, we don't have unbound connection-less sockets,
- // so no SS_DISCONNECTING.
-
- case linux.SOCK_SEQPACKET:
- fallthrough
- case linux.SOCK_STREAM:
- // Connectioned.
- if sops.Endpoint().(transport.ConnectingEndpoint).Connected() {
- sockState = linux.SS_CONNECTED
- } else {
- sockState = linux.SS_UNCONNECTED
- }
- }
-
// In the socket entry below, the value for the 'Num' field requires
// some consideration. Linux prints the address to the struct
// unix_sock representing a socket in the kernel, but may redact the
@@ -282,7 +266,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s
0, // Protocol, always 0 for UDS.
sockFlags, // Flags.
sops.Endpoint().Type(), // Type.
- sockState, // State.
+ sops.State(), // State.
sfile.InodeID(), // Inode.
)
diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go
index 77e03d349..21a965f90 100644
--- a/pkg/sentry/fs/proc/task.go
+++ b/pkg/sentry/fs/proc/task.go
@@ -96,7 +96,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, showSubtasks boo
contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers)
}
- // TODO(b/31916171): Set EUID/EGID based on dumpability.
+ // N.B. taskOwnedInodeOps enforces dumpability-based ownership.
d := &taskDir{
Dir: *ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)),
t: t,
@@ -667,6 +667,21 @@ func newComm(t *kernel.Task, msrc *fs.MountSource) *fs.Inode {
return newProcInode(c, msrc, fs.SpecialFile, t)
}
+// Check implements fs.InodeOperations.Check.
+func (c *comm) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
+ // This file can always be read or written by members of the same
+ // thread group. See fs/proc/base.c:proc_tid_comm_permission.
+ //
+ // N.B. This check is currently a no-op as we don't yet support writing
+	// and this file is world-readable anyway.
+ t := kernel.TaskFromContext(ctx)
+ if t != nil && t.ThreadGroup() == c.t.ThreadGroup() && !p.Execute {
+ return true
+ }
+
+ return fs.ContextCanAccessFile(ctx, inode, p)
+}
+
// GetFile implements fs.InodeOperations.GetFile.
func (c *comm) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
return fs.NewFile(ctx, dirent, flags, &commFile{t: c.t}), nil
diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go
index bce5f091d..c1721f434 100644
--- a/pkg/sentry/fs/timerfd/timerfd.go
+++ b/pkg/sentry/fs/timerfd/timerfd.go
@@ -54,6 +54,8 @@ type TimerOperations struct {
// NewFile returns a timerfd File that receives time from c.
func NewFile(ctx context.Context, c ktime.Clock) *fs.File {
dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[timerfd]")
+ // Release the initial dirent reference after NewFile takes a reference.
+ defer dirent.DecRef()
tops := &TimerOperations{}
tops.timer = ktime.NewTimer(c, tops)
// Timerfds reject writes, but the Write flag must be set in order to
diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go
index b7c29a4d1..83e1bf247 100644
--- a/pkg/sentry/fs/tmpfs/fs.go
+++ b/pkg/sentry/fs/tmpfs/fs.go
@@ -34,6 +34,16 @@ const (
// GID for the root directory.
rootGIDKey = "gid"
+ // cacheKey sets the caching policy for the mount.
+ cacheKey = "cache"
+
+ // cacheAll uses the virtual file system cache for everything (default).
+ cacheAll = "cache"
+
+ // cacheRevalidate allows dirents to be cached, but revalidates them on each
+ // lookup.
+ cacheRevalidate = "revalidate"
+
// TODO(edahlgren/mpratt): support a tmpfs size limit.
// size = "size"
@@ -122,15 +132,24 @@ func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSou
delete(options, rootGIDKey)
}
+ // Construct a mount which will follow the cache options provided.
+ var msrc *fs.MountSource
+ switch options[cacheKey] {
+ case "", cacheAll:
+ msrc = fs.NewCachingMountSource(f, flags)
+ case cacheRevalidate:
+ msrc = fs.NewRevalidatingMountSource(f, flags)
+ default:
+ return nil, fmt.Errorf("invalid cache policy option %q", options[cacheKey])
+ }
+ delete(options, cacheKey)
+
// Fail if the caller passed us more options than we can parse. They may be
// expecting us to set something we can't set.
if len(options) > 0 {
return nil, fmt.Errorf("unsupported mount options: %v", options)
}
- // Construct a mount which will cache dirents.
- msrc := fs.NewCachingMountSource(f, flags)
-
// Construct the tmpfs root.
return NewDir(ctx, nil, owner, perms, msrc), nil
}
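
A standalone sketch of the cache-option parsing added above; names here are illustrative, and the real code returns an fs.MountSource rather than an enum.

package main

import "fmt"

type cachePolicy int

const (
	cacheAll        cachePolicy = iota // default: use the VFS cache for everything
	cacheRevalidate                    // cache dirents but revalidate them on lookup
)

func parseCachePolicy(options map[string]string) (cachePolicy, error) {
	v := options["cache"]
	delete(options, "cache") // consume the option so leftovers can be rejected
	switch v {
	case "", "cache":
		return cacheAll, nil
	case "revalidate":
		return cacheRevalidate, nil
	default:
		return 0, fmt.Errorf("invalid cache policy option %q", v)
	}
}

func main() {
	opts := map[string]string{"cache": "revalidate", "mode": "0777"}
	p, err := parseCachePolicy(opts)
	fmt.Println(p, err, opts) // 1 <nil> map[mode:0777]
}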
diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD
new file mode 100644
index 000000000..1a4632a54
--- /dev/null
+++ b/pkg/sentry/hostmm/BUILD
@@ -0,0 +1,18 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "hostmm",
+ srcs = [
+ "cgroup.go",
+ "hostmm.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/hostmm",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/fd",
+ "//pkg/log",
+ "//pkg/sentry/usermem",
+ ],
+)
diff --git a/pkg/sentry/hostmm/cgroup.go b/pkg/sentry/hostmm/cgroup.go
new file mode 100644
index 000000000..e5cc26ab2
--- /dev/null
+++ b/pkg/sentry/hostmm/cgroup.go
@@ -0,0 +1,111 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostmm
+
+import (
+ "bufio"
+ "fmt"
+ "os"
+ "path"
+ "strings"
+)
+
+// currentCgroupDirectory returns the directory for the cgroup for the given
+// controller in which the calling process resides.
+func currentCgroupDirectory(ctrl string) (string, error) {
+ root, err := cgroupRootDirectory(ctrl)
+ if err != nil {
+ return "", err
+ }
+ cg, err := currentCgroup(ctrl)
+ if err != nil {
+ return "", err
+ }
+ return path.Join(root, cg), nil
+}
+
+// cgroupRootDirectory returns the root directory for the cgroup hierarchy in
+// which the given cgroup controller is mounted in the calling process' mount
+// namespace.
+func cgroupRootDirectory(ctrl string) (string, error) {
+ const path = "/proc/self/mounts"
+ file, err := os.Open(path)
+ if err != nil {
+ return "", err
+ }
+ defer file.Close()
+
+ // Per proc(5) -> fstab(5):
+ // Each line of /proc/self/mounts describes a mount.
+ scanner := bufio.NewScanner(file)
+ for scanner.Scan() {
+ // Each line consists of 6 space-separated fields. Find the line for
+ // which the third field (fs_vfstype) is cgroup, and the fourth field
+ // (fs_mntops, a comma-separated list of mount options) contains
+ // ctrl.
+ var spec, file, vfstype, mntopts, freq, passno string
+ const nrfields = 6
+ line := scanner.Text()
+ n, err := fmt.Sscan(line, &spec, &file, &vfstype, &mntopts, &freq, &passno)
+ if err != nil {
+ return "", fmt.Errorf("failed to parse %s: %v", path, err)
+ }
+ if n != nrfields {
+ return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, n, nrfields)
+ }
+ if vfstype != "cgroup" {
+ continue
+ }
+ for _, mntopt := range strings.Split(mntopts, ",") {
+ if mntopt == ctrl {
+ return file, nil
+ }
+ }
+ }
+ return "", fmt.Errorf("no cgroup hierarchy mounted for controller %s", ctrl)
+}
+
+// currentCgroup returns the cgroup for the given controller in which the
+// calling process resides. The returned string is a path that should be
+// interpreted as relative to cgroupRootDirectory(ctrl).
+func currentCgroup(ctrl string) (string, error) {
+ const path = "/proc/self/cgroup"
+ file, err := os.Open(path)
+ if err != nil {
+ return "", err
+ }
+ defer file.Close()
+
+ // Per proc(5) -> cgroups(7):
+	// Each line of /proc/self/cgroup describes a cgroup hierarchy.
+ scanner := bufio.NewScanner(file)
+ for scanner.Scan() {
+ // Each line consists of 3 colon-separated fields. Find the line for
+ // which the second field (controller-list, a comma-separated list of
+ // cgroup controllers) contains ctrl.
+ line := scanner.Text()
+ const nrfields = 3
+ fields := strings.Split(line, ":")
+ if len(fields) != nrfields {
+ return "", fmt.Errorf("failed to parse %s: line %q: got %d fields, wanted %d", path, line, len(fields), nrfields)
+ }
+ for _, controller := range strings.Split(fields[1], ",") {
+ if controller == ctrl {
+ return fields[2], nil
+ }
+ }
+ }
+ return "", fmt.Errorf("not a member of a cgroup hierarchy for controller %s", ctrl)
+}
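
A worked, standalone example (with hard-coded sample lines) of how the two parses above combine: the cgroup mount point found in /proc/self/mounts is joined with the controller's relative path from /proc/self/cgroup.

package main

import (
	"fmt"
	"path"
	"strings"
)

func main() {
	// A typical cgroup-v1 line from /proc/self/mounts:
	// fs_spec fs_file fs_vfstype fs_mntops fs_freq fs_passno
	mountsLine := "cgroup /sys/fs/cgroup/memory cgroup rw,nosuid,nodev,noexec,relatime,memory 0 0"
	fields := strings.Fields(mountsLine)
	root := ""
	if fields[2] == "cgroup" {
		for _, opt := range strings.Split(fields[3], ",") {
			if opt == "memory" {
				root = fields[1]
			}
		}
	}

	// A typical line from /proc/self/cgroup: hierarchy-ID:controller-list:cgroup-path.
	cgroupLine := "4:memory:/user.slice"
	parts := strings.Split(cgroupLine, ":")
	rel := ""
	for _, ctrl := range strings.Split(parts[1], ",") {
		if ctrl == "memory" {
			rel = parts[2]
		}
	}

	fmt.Println(path.Join(root, rel)) // /sys/fs/cgroup/memory/user.slice
}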
diff --git a/pkg/sentry/hostmm/hostmm.go b/pkg/sentry/hostmm/hostmm.go
new file mode 100644
index 000000000..5432cada9
--- /dev/null
+++ b/pkg/sentry/hostmm/hostmm.go
@@ -0,0 +1,130 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package hostmm provides tools for interacting with the host Linux kernel's
+// virtual memory management subsystem.
+package hostmm
+
+import (
+ "fmt"
+ "os"
+ "path"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/fd"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// NotifyCurrentMemcgPressureCallback requests that f is called whenever the
+// calling process' memory cgroup indicates memory pressure of the given level,
+// as specified by Linux's Documentation/cgroup-v1/memory.txt.
+//
+// If NotifyCurrentMemcgPressureCallback succeeds, it returns a function that
+// terminates the requested memory pressure notifications. This function may be
+// called at most once.
+func NotifyCurrentMemcgPressureCallback(f func(), level string) (func(), error) {
+ cgdir, err := currentCgroupDirectory("memory")
+ if err != nil {
+ return nil, err
+ }
+
+ pressurePath := path.Join(cgdir, "memory.pressure_level")
+ pressureFile, err := os.Open(pressurePath)
+ if err != nil {
+ return nil, err
+ }
+ defer pressureFile.Close()
+
+ eventControlPath := path.Join(cgdir, "cgroup.event_control")
+ eventControlFile, err := os.OpenFile(eventControlPath, os.O_WRONLY, 0)
+ if err != nil {
+ return nil, err
+ }
+ defer eventControlFile.Close()
+
+ eventFD, err := newEventFD()
+ if err != nil {
+ return nil, err
+ }
+
+ // Don't use fmt.Fprintf since the whole string needs to be written in a
+ // single syscall.
+ eventControlStr := fmt.Sprintf("%d %d %s", eventFD.FD(), pressureFile.Fd(), level)
+ if n, err := eventControlFile.Write([]byte(eventControlStr)); n != len(eventControlStr) || err != nil {
+ eventFD.Close()
+ return nil, fmt.Errorf("error writing %q to %s: got (%d, %v), wanted (%d, nil)", eventControlStr, eventControlPath, n, err, len(eventControlStr))
+ }
+
+ log.Debugf("Receiving memory pressure level notifications from %s at level %q", pressurePath, level)
+ const sizeofUint64 = 8
+ // The most significant bit of the eventfd value is set by the stop
+ // function, which is practically unambiguous since it's not plausible for
+ // 2**63 pressure events to occur between eventfd reads.
+ const stopVal = 1 << 63
+ stopCh := make(chan struct{})
+ go func() { // S/R-SAFE: f provides synchronization if necessary
+ rw := fd.NewReadWriter(eventFD.FD())
+ var buf [sizeofUint64]byte
+ for {
+ n, err := rw.Read(buf[:])
+ if err != nil {
+ if err == syscall.EINTR {
+ continue
+ }
+ panic(fmt.Sprintf("failed to read from memory pressure level eventfd: %v", err))
+ }
+ if n != sizeofUint64 {
+ panic(fmt.Sprintf("short read from memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64))
+ }
+ val := usermem.ByteOrder.Uint64(buf[:])
+ if val >= stopVal {
+ // Assume this was due to the notifier's "destructor" (the
+ // function returned by NotifyCurrentMemcgPressureCallback
+ // below) being called.
+ eventFD.Close()
+ close(stopCh)
+ return
+ }
+ f()
+ }
+ }()
+ return func() {
+ rw := fd.NewReadWriter(eventFD.FD())
+ var buf [sizeofUint64]byte
+ usermem.ByteOrder.PutUint64(buf[:], stopVal)
+ for {
+ n, err := rw.Write(buf[:])
+ if err != nil {
+ if err == syscall.EINTR {
+ continue
+ }
+ panic(fmt.Sprintf("failed to write to memory pressure level eventfd: %v", err))
+ }
+ if n != sizeofUint64 {
+ panic(fmt.Sprintf("short write to memory pressure level eventfd: got %d bytes, wanted %d", n, sizeofUint64))
+ }
+ break
+ }
+ <-stopCh
+ }, nil
+}
+
+func newEventFD() (*fd.FD, error) {
+ f, _, e := syscall.Syscall(syscall.SYS_EVENTFD2, 0, 0, 0)
+ if e != 0 {
+ return nil, fmt.Errorf("failed to create eventfd: %v", e)
+ }
+ return fd.New(int(f)), nil
+}
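
A hypothetical usage sketch of the exported helper above. The import path comes from the BUILD file in this change, but the package's Bazel visibility is sentry-internal, so this is purely illustrative; it also needs a host with a cgroup-v1 memory controller.

package main

import (
	"log"
	"time"

	"gvisor.googlesource.com/gvisor/pkg/sentry/hostmm"
)

func main() {
	stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
		log.Println("memory cgroup reported low-level pressure")
	}, "low")
	if err != nil {
		log.Fatalf("could not register for pressure notifications: %v", err)
	}
	// ... run the work that should respond to memory pressure ...
	time.Sleep(time.Minute)
	stop() // terminate notifications; may be called at most once
}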
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 99a2fd964..04e375910 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -64,6 +64,18 @@ go_template_instance(
},
)
+go_template_instance(
+ name = "socket_list",
+ out = "socket_list.go",
+ package = "kernel",
+ prefix = "socket",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*SocketEntry",
+ "Linker": "*SocketEntry",
+ },
+)
+
proto_library(
name = "uncaught_signal_proto",
srcs = ["uncaught_signal.proto"],
@@ -104,6 +116,7 @@ go_library(
"sessions.go",
"signal.go",
"signal_handlers.go",
+ "socket_list.go",
"syscalls.go",
"syscalls_state.go",
"syslog.go",
diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go
index bbacba1f4..43ae22a5d 100644
--- a/pkg/sentry/kernel/epoll/epoll.go
+++ b/pkg/sentry/kernel/epoll/epoll.go
@@ -156,6 +156,8 @@ var cycleMu sync.Mutex
func NewEventPoll(ctx context.Context) *fs.File {
// name matches fs/eventpoll.c:epoll_create1.
dirent := fs.NewDirent(anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]"))
+ // Release the initial dirent reference after NewFile takes a reference.
+ defer dirent.DecRef()
return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{
files: make(map[FileIdentifier]*pollEntry),
})
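
The "defer dirent.DecRef()" lines added here and in the timerfd and eventfd hunks all follow one ownership rule; a toy sketch of that rule with stand-in types (not the sentry's refs package):

package main

import "fmt"

type dirent struct{ refs int }

func newDirent() *dirent { return &dirent{refs: 1} } // the caller owns this initial reference

func (d *dirent) IncRef() { d.refs++ }

func (d *dirent) DecRef() {
	d.refs--
	if d.refs == 0 {
		fmt.Println("dirent destroyed")
	}
}

type file struct{ d *dirent }

// newFile takes its own reference on d, mirroring fs.NewFile.
func newFile(d *dirent) *file {
	d.IncRef()
	return &file{d: d}
}

// DecRef releases the file, dropping its reference on the dirent.
func (f *file) DecRef() { f.d.DecRef() }

func main() {
	d := newDirent()
	defer d.DecRef() // drop the initial reference; the file keeps its own
	f := newFile(d)
	f.DecRef() // the last reference is released here or in the deferred call
}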
diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go
index 2f900be38..fe474cbf0 100644
--- a/pkg/sentry/kernel/eventfd/eventfd.go
+++ b/pkg/sentry/kernel/eventfd/eventfd.go
@@ -69,6 +69,8 @@ type EventOperations struct {
func New(ctx context.Context, initVal uint64, semMode bool) *fs.File {
// name matches fs/eventfd.c:eventfd_file_create.
dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[eventfd]")
+ // Release the initial dirent reference after NewFile takes a reference.
+ defer dirent.DecRef()
return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{
val: initVal,
semMode: semMode,
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 85d73ace2..f253a81d9 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -182,9 +182,13 @@ type Kernel struct {
// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
- // socketTable is used to track all sockets on the system. Protected by
+	// sockets is the list of all network sockets in the system. Protected by
// extMu.
- socketTable map[int]map[*refs.WeakRef]struct{}
+ sockets socketList
+
+ // nextSocketEntry is the next entry number to use in sockets. Protected
+ // by extMu.
+ nextSocketEntry uint64
// deviceRegistry is used to save/restore device.SimpleDevices.
deviceRegistry struct{} `state:".(*device.Registry)"`
@@ -283,7 +287,6 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
k.futexes = futex.NewManager()
k.netlinkPorts = port.New()
- k.socketTable = make(map[int]map[*refs.WeakRef]struct{})
return nil
}
@@ -1137,51 +1140,43 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
})
}
-// socketEntry represents a socket recorded in Kernel.socketTable. It implements
+// SocketEntry represents a socket recorded in Kernel.sockets. It implements
// refs.WeakRefUser for sockets stored in the socket table.
//
// +stateify savable
-type socketEntry struct {
- k *Kernel
- sock *refs.WeakRef
- family int
+type SocketEntry struct {
+ socketEntry
+ k *Kernel
+ Sock *refs.WeakRef
+ ID uint64 // Socket table entry number.
}
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
-func (s *socketEntry) WeakRefGone() {
+func (s *SocketEntry) WeakRefGone() {
s.k.extMu.Lock()
- // k.socketTable is guaranteed to point to a valid socket table for s.family
- // at this point, since we made sure of the fact when we created this
- // socketEntry, and we never delete socket tables.
- delete(s.k.socketTable[s.family], s.sock)
+ s.k.sockets.Remove(s)
s.k.extMu.Unlock()
}
// RecordSocket adds a socket to the system-wide socket table for tracking.
//
// Precondition: Caller must hold a reference to sock.
-func (k *Kernel) RecordSocket(sock *fs.File, family int) {
+func (k *Kernel) RecordSocket(sock *fs.File) {
k.extMu.Lock()
- table, ok := k.socketTable[family]
- if !ok {
- table = make(map[*refs.WeakRef]struct{})
- k.socketTable[family] = table
- }
- se := socketEntry{k: k, family: family}
- se.sock = refs.NewWeakRef(sock, &se)
- table[se.sock] = struct{}{}
+ id := k.nextSocketEntry
+ k.nextSocketEntry++
+ s := &SocketEntry{k: k, ID: id}
+ s.Sock = refs.NewWeakRef(sock, s)
+ k.sockets.PushBack(s)
k.extMu.Unlock()
}
-// ListSockets returns a snapshot of all sockets of a given family.
-func (k *Kernel) ListSockets(family int) []*refs.WeakRef {
+// ListSockets returns a snapshot of all sockets.
+func (k *Kernel) ListSockets() []*SocketEntry {
k.extMu.Lock()
- socks := []*refs.WeakRef{}
- if table, ok := k.socketTable[family]; ok {
- socks = make([]*refs.WeakRef, 0, len(table))
- for s := range table {
- socks = append(socks, s)
- }
+ var socks []*SocketEntry
+ for s := k.sockets.Front(); s != nil; s = s.Next() {
+ socks = append(socks, s)
}
k.extMu.Unlock()
return socks
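
The consumer-side pattern implied by the new ListSockets API (already visible in the proc/net.go hunk above), sketched standalone with stand-in types for refs.WeakRef and SocketEntry:

package main

import "fmt"

type socket struct{ family int }

// weakRef stands in for refs.WeakRef: Get returns nil if the referent has
// already been destroyed.
type weakRef struct{ s *socket }

func (w *weakRef) Get() *socket { return w.s }

type socketEntry struct {
	ID   uint64
	Sock *weakRef
}

func main() {
	entries := []*socketEntry{
		{ID: 0, Sock: &weakRef{&socket{family: 1 /* AF_UNIX */}}},
		{ID: 1, Sock: &weakRef{nil}}, // racing with destruction
		{ID: 2, Sock: &weakRef{&socket{family: 2 /* AF_INET */}}},
	}
	for _, se := range entries {
		s := se.Sock.Get()
		if s == nil {
			continue // destroyed between snapshot and use
		}
		if s.family != 1 {
			continue // not a unix socket; the table now holds every family
		}
		fmt.Printf("unix socket entry %d\n", se.ID)
	}
}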
diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go
index 926c4c623..dc7da529e 100644
--- a/pkg/sentry/kernel/pipe/node.go
+++ b/pkg/sentry/kernel/pipe/node.go
@@ -38,7 +38,11 @@ type inodeOperations struct {
fsutil.InodeNotMappable `state:"nosave"`
fsutil.InodeNotSocket `state:"nosave"`
fsutil.InodeNotSymlink `state:"nosave"`
- fsutil.InodeNotVirtual `state:"nosave"`
+
+ // Marking pipe inodes as virtual allows them to be saved and restored
+ // even if they have been unlinked. We can get away with this because
+ // their state exists entirely within the sentry.
+ fsutil.InodeVirtual `state:"nosave"`
fsutil.InodeSimpleAttributes
@@ -86,7 +90,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
switch {
case flags.Read && !flags.Write: // O_RDONLY.
- r := i.p.Open(ctx, flags)
+ r := i.p.Open(ctx, d, flags)
i.newHandleLocked(&i.rWakeup)
if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() {
@@ -102,7 +106,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
return r, nil
case flags.Write && !flags.Read: // O_WRONLY.
- w := i.p.Open(ctx, flags)
+ w := i.p.Open(ctx, d, flags)
i.newHandleLocked(&i.wWakeup)
if i.p.isNamed && !i.p.HasReaders() {
@@ -122,7 +126,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi
case flags.Read && flags.Write: // O_RDWR.
 		// Pipes opened for read-write always succeed without blocking.
- rw := i.p.Open(ctx, flags)
+ rw := i.p.Open(ctx, d, flags)
i.newHandleLocked(&i.rWakeup)
i.newHandleLocked(&i.wWakeup)
return rw, nil
diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go
index 31d9b0443..9a946b380 100644
--- a/pkg/sentry/kernel/pipe/node_test.go
+++ b/pkg/sentry/kernel/pipe/node_test.go
@@ -62,7 +62,9 @@ var perms fs.FilePermissions = fs.FilePermissions{
}
func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, doneChan chan<- struct{}) (*fs.File, error) {
- file, err := n.GetFile(ctx, nil, flags)
+ inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe})
+ d := fs.NewDirent(inode, "pipe")
+ file, err := n.GetFile(ctx, d, flags)
if err != nil {
t.Fatalf("open with flags %+v failed: %v", flags, err)
}
@@ -73,7 +75,9 @@ func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flag
}
func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.FileFlags, resChan chan<- openResult) (*fs.File, error) {
- file, err := n.GetFile(ctx, nil, flags)
+ inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe})
+ d := fs.NewDirent(inode, "pipe")
+ file, err := n.GetFile(ctx, d, flags)
if resChan != nil {
resChan <- openResult{file, err}
}
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index b65204492..73438dc62 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -71,11 +71,6 @@ type Pipe struct {
// This value is immutable.
atomicIOBytes int64
- // The dirent backing this pipe. Shared by all readers and writers.
- //
- // This value is immutable.
- Dirent *fs.Dirent
-
// The number of active readers for this pipe.
//
// Access atomically.
@@ -130,14 +125,20 @@ func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64)
if atomicIOBytes > sizeBytes {
atomicIOBytes = sizeBytes
}
- p := &Pipe{
+ return &Pipe{
isNamed: isNamed,
max: sizeBytes,
atomicIOBytes: atomicIOBytes,
}
+}
- // Build the fs.Dirent of this pipe, shared by all fs.Files associated
- // with this pipe.
+// NewConnectedPipe initializes a pipe and returns a pair of objects
+// representing the read and write ends of the pipe.
+func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) {
+ p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes)
+
+ // Build an fs.Dirent for the pipe which will be shared by both
+ // returned files.
perms := fs.FilePermissions{
User: fs.PermMask{Read: true, Write: true},
}
@@ -150,36 +151,32 @@ func NewPipe(ctx context.Context, isNamed bool, sizeBytes, atomicIOBytes int64)
BlockSize: int64(atomicIOBytes),
}
ms := fs.NewPseudoMountSource()
- p.Dirent = fs.NewDirent(fs.NewInode(iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino))
- return p
-}
-
-// NewConnectedPipe initializes a pipe and returns a pair of objects
-// representing the read and write ends of the pipe.
-func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) {
- p := NewPipe(ctx, false /* isNamed */, sizeBytes, atomicIOBytes)
- return p.Open(ctx, fs.FileFlags{Read: true}), p.Open(ctx, fs.FileFlags{Write: true})
+ d := fs.NewDirent(fs.NewInode(iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino))
+ // The p.Open calls below will each take a reference on the Dirent. We
+ // must drop the one we already have.
+ defer d.DecRef()
+ return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true})
}
// Open opens the pipe and returns a new file.
//
// Precondition: at least one of flags.Read or flags.Write must be set.
-func (p *Pipe) Open(ctx context.Context, flags fs.FileFlags) *fs.File {
+func (p *Pipe) Open(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) *fs.File {
switch {
case flags.Read && flags.Write:
p.rOpen()
p.wOpen()
- return fs.NewFile(ctx, p.Dirent, flags, &ReaderWriter{
+ return fs.NewFile(ctx, d, flags, &ReaderWriter{
Pipe: p,
})
case flags.Read:
p.rOpen()
- return fs.NewFile(ctx, p.Dirent, flags, &Reader{
+ return fs.NewFile(ctx, d, flags, &Reader{
ReaderWriter: ReaderWriter{Pipe: p},
})
case flags.Write:
p.wOpen()
- return fs.NewFile(ctx, p.Dirent, flags, &Writer{
+ return fs.NewFile(ctx, d, flags, &Writer{
ReaderWriter: ReaderWriter{Pipe: p},
})
default:
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 4423e7efd..193447b17 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -19,6 +19,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
@@ -92,6 +93,14 @@ const (
// ptrace(2), subsection "Ptrace access mode checking". If attach is true, it
// checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access
// mode PTRACE_MODE_READ.
+//
+// NOTE(b/30815691): The result of CanTrace is immediately stale (e.g., a
+// racing setuid(2) may change traceability). This may pose a risk when a task
+// changes from traceable to not traceable. This is only problematic across
+// execve, where privileges may increase.
+//
+// We currently do not implement privileged executables (set-user/group-ID bits
+// and file capabilities), so that case is not reachable.
func (t *Task) CanTrace(target *Task, attach bool) bool {
// "1. If the calling thread and the target thread are in the same thread
// group, access is always allowed." - ptrace(2)
@@ -162,7 +171,13 @@ func (t *Task) CanTrace(target *Task, attach bool) bool {
if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID {
return false
}
- // TODO(b/31916171): dumpability check
+ var targetMM *mm.MemoryManager
+ target.WithMuLocked(func(t *Task) {
+ targetMM = t.MemoryManager()
+ })
+ if targetMM != nil && targetMM.Dumpability() != mm.UserDumpable {
+ return false
+ }
if callerCreds.UserNamespace != targetCreds.UserNamespace {
return false
}
diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go
index 0572053db..27cd3728b 100644
--- a/pkg/sentry/kernel/syscalls.go
+++ b/pkg/sentry/kernel/syscalls.go
@@ -32,6 +32,51 @@ import (
// syscall.
const maxSyscallNum = 2000
+// SyscallSupportLevel indicates the level of support for a syscall.
+type SyscallSupportLevel int
+
+// String returns a human-readable representation of the support level.
+func (l SyscallSupportLevel) String() string {
+ switch l {
+ case SupportUnimplemented:
+ return "Unimplemented"
+ case SupportPartial:
+ return "Partial Support"
+ case SupportFull:
+ return "Full Support"
+ default:
+ return "Undocumented"
+ }
+}
+
+const (
+ // SupportUndocumented indicates the syscall is not documented yet.
+ SupportUndocumented = iota
+
+ // SupportUnimplemented indicates the syscall is unimplemented.
+ SupportUnimplemented
+
+ // SupportPartial indicates the syscall is partially supported.
+ SupportPartial
+
+ // SupportFull indicates the syscall is fully supported.
+ SupportFull
+)
+
+// Syscall includes the syscall implementation and compatibility information.
+type Syscall struct {
+ // Name is the syscall name.
+ Name string
+ // Fn is the implementation of the syscall.
+ Fn SyscallFn
+ // SupportLevel is the level of support implemented in gVisor.
+ SupportLevel SyscallSupportLevel
+ // Note describes the compatibility of the syscall.
+ Note string
+	// URLs is a set of URLs to any relevant bugs or issues.
+ URLs []string
+}
+
// SyscallFn is a syscall implementation.
type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error)
@@ -83,7 +128,7 @@ type SyscallFlagsTable struct {
// Init initializes the struct, with all syscalls in table set to enabled.
//
// max is the largest syscall number in table.
-func (e *SyscallFlagsTable) init(table map[uintptr]SyscallFn, max uintptr) {
+func (e *SyscallFlagsTable) init(table map[uintptr]Syscall, max uintptr) {
e.enable = make([]uint32, max+1)
for num := range table {
e.enable[num] = syscallPresent
@@ -194,7 +239,7 @@ type SyscallTable struct {
AuditNumber uint32 `state:"manual"`
// Table is the collection of functions.
- Table map[uintptr]SyscallFn `state:"manual"`
+ Table map[uintptr]Syscall `state:"manual"`
// lookup is a fixed-size array that holds the syscalls (indexed by
// their numbers). It is used for fast look ups.
@@ -247,7 +292,7 @@ func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) {
func RegisterSyscallTable(s *SyscallTable) {
if s.Table == nil {
// Ensure non-nil lookup table.
- s.Table = make(map[uintptr]SyscallFn)
+ s.Table = make(map[uintptr]Syscall)
}
if s.Emulate == nil {
// Ensure non-nil emulate table.
@@ -268,8 +313,8 @@ func RegisterSyscallTable(s *SyscallTable) {
s.lookup = make([]SyscallFn, max+1)
// Initialize the fast-lookup table.
- for num, fn := range s.Table {
- s.lookup[num] = fn
+ for num, sc := range s.Table {
+ s.lookup[num] = sc.Fn
}
s.FeatureEnable.init(s.Table, max)
@@ -303,5 +348,8 @@ func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) {
// mapLookup is similar to Lookup, except that it only uses the syscall table,
// that is, it skips the fast look array. This is available for benchmarking.
func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn {
- return s.Table[sysno]
+ if sc, ok := s.Table[sysno]; ok {
+ return sc.Fn
+ }
+ return nil
}
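
A trimmed, standalone sketch of what a table entry looks like with the new Syscall struct. The types mirror the ones added above, but the getpid entry, its number, and the placeholder SyscallFn signature are illustrative, not taken from this change.

package main

import "fmt"

type SyscallSupportLevel int

const (
	SupportUndocumented SyscallSupportLevel = iota
	SupportUnimplemented
	SupportPartial
	SupportFull
)

// SyscallFn is a simplified stand-in for the kernel package's SyscallFn.
type SyscallFn func(args ...uintptr) (uintptr, error)

type Syscall struct {
	Name         string
	Fn           SyscallFn
	SupportLevel SyscallSupportLevel
	Note         string
	URLs         []string
}

var table = map[uintptr]Syscall{
	39: { // hypothetical entry for getpid on amd64
		Name:         "getpid",
		Fn:           func(args ...uintptr) (uintptr, error) { return 42, nil },
		SupportLevel: SupportFull,
	},
}

func main() {
	// The fast-lookup path only needs the function, mirroring
	// s.lookup[num] = sc.Fn in the hunk above.
	if sc, ok := table[39]; ok {
		v, _ := sc.Fn()
		fmt.Println(sc.Name, sc.SupportLevel, v)
	}
}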
diff --git a/pkg/sentry/kernel/table_test.go b/pkg/sentry/kernel/table_test.go
index 8f7cdb9f3..3f2b042c8 100644
--- a/pkg/sentry/kernel/table_test.go
+++ b/pkg/sentry/kernel/table_test.go
@@ -26,11 +26,13 @@ const (
)
func createSyscallTable() *SyscallTable {
- m := make(map[uintptr]SyscallFn)
+ m := make(map[uintptr]Syscall)
for i := uintptr(0); i <= maxTestSyscall; i++ {
j := i
- m[i] = func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) {
- return j, nil, nil
+ m[i] = Syscall{
+ Fn: func(*Task, arch.SyscallArguments) (uintptr, *SyscallControl, error) {
+ return j, nil, nil
+ },
}
}
diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go
index f9378c2de..4d889422f 100644
--- a/pkg/sentry/kernel/task.go
+++ b/pkg/sentry/kernel/task.go
@@ -455,12 +455,13 @@ type Task struct {
// single numa node, all policies are no-ops. We only track this information
// so that we can return reasonable values if the application calls
// get_mempolicy(2) after setting a non-default policy. Note that in the
- // real syscall, nodemask can be longer than 4 bytes, but we always report a
- // single node so never need to save more than a single bit.
+ // real syscall, nodemask can be longer than a single unsigned long, but we
+ // always report a single node so never need to save more than a single
+ // bit.
//
// numaPolicy and numaNodeMask are protected by mu.
numaPolicy int32
- numaNodeMask uint32
+ numaNodeMask uint64
// If netns is true, the task is in a non-root network namespace. Network
// namespaces aren't currently implemented in full; being in a network
diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go
index 5d1425d5c..35d5cb90c 100644
--- a/pkg/sentry/kernel/task_exec.go
+++ b/pkg/sentry/kernel/task_exec.go
@@ -68,6 +68,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
@@ -198,6 +199,12 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState {
return flags.CloseOnExec
})
+ // NOTE(b/30815691): We currently do not implement privileged
+ // executables (set-user/group-ID bits and file capabilities). This
+ // allows us to unconditionally enable user dumpability on the new mm.
+ // See fs/exec.c:setup_new_exec.
+ r.tc.MemoryManager.SetDumpability(mm.UserDumpable)
+
// Switch to the new process.
t.MemoryManager().Deactivate()
t.mu.Lock()
diff --git a/pkg/sentry/kernel/task_identity.go b/pkg/sentry/kernel/task_identity.go
index 17f08729a..ec95f78d0 100644
--- a/pkg/sentry/kernel/task_identity.go
+++ b/pkg/sentry/kernel/task_identity.go
@@ -17,6 +17,7 @@ package kernel
import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
@@ -206,8 +207,17 @@ func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) {
// (filesystem UIDs aren't implemented, nor are any of the capabilities in
// question)
- // Not documented, but compare Linux's kernel/cred.c:commit_creds().
if oldE != newE {
+ // "[dumpability] is reset to the current value contained in
+ // the file /proc/sys/fs/suid_dumpable (which by default has
+ // the value 0), in the following circumstances: The process's
+ // effective user or group ID is changed." - prctl(2)
+ //
+ // (suid_dumpable isn't implemented, so we just use the
+		// default.)
+ t.MemoryManager().SetDumpability(mm.NotDumpable)
+
+ // Not documented, but compare Linux's kernel/cred.c:commit_creds().
t.parentDeathSignal = 0
}
}
@@ -303,8 +313,18 @@ func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) {
t.creds = t.creds.Fork() // See doc for creds.
t.creds.RealKGID, t.creds.EffectiveKGID, t.creds.SavedKGID = newR, newE, newS
- // Not documented, but compare Linux's kernel/cred.c:commit_creds().
if oldE != newE {
+ // "[dumpability] is reset to the current value contained in
+ // the file /proc/sys/fs/suid_dumpable (which by default has
+ // the value 0), in the following circumstances: The process's
+ // effective user or group ID is changed." - prctl(2)
+ //
+ // (suid_dumpable isn't implemented, so we just use the
+		// default.)
+ t.MemoryManager().SetDumpability(mm.NotDumpable)
+
+ // Not documented, but compare Linux's
+ // kernel/cred.c:commit_creds().
t.parentDeathSignal = 0
}
}
diff --git a/pkg/sentry/kernel/task_sched.go b/pkg/sentry/kernel/task_sched.go
index 5455f6ea9..1c94ab11b 100644
--- a/pkg/sentry/kernel/task_sched.go
+++ b/pkg/sentry/kernel/task_sched.go
@@ -622,14 +622,14 @@ func (t *Task) SetNiceness(n int) {
}
// NumaPolicy returns t's current numa policy.
-func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) {
+func (t *Task) NumaPolicy() (policy int32, nodeMask uint64) {
t.mu.Lock()
defer t.mu.Unlock()
return t.numaPolicy, t.numaNodeMask
}
// SetNumaPolicy sets t's numa policy.
-func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) {
+func (t *Task) SetNumaPolicy(policy int32, nodeMask uint64) {
t.mu.Lock()
defer t.mu.Unlock()
t.numaPolicy = policy
diff --git a/pkg/sentry/memutil/BUILD b/pkg/sentry/memutil/BUILD
deleted file mode 100644
index 68b03d4cc..000000000
--- a/pkg/sentry/memutil/BUILD
+++ /dev/null
@@ -1,14 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-
-package(licenses = ["notice"])
-
-go_library(
- name = "memutil",
- srcs = [
- "memutil.go",
- "memutil_unsafe.go",
- ],
- importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/memutil",
- visibility = ["//pkg/sentry:internal"],
- deps = ["@org_golang_x_sys//unix:go_default_library"],
-)
diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go
index 7a65a62a2..7646d5ab2 100644
--- a/pkg/sentry/mm/lifecycle.go
+++ b/pkg/sentry/mm/lifecycle.go
@@ -37,6 +37,7 @@ func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider) *Memo
privateRefs: &privateRefs{},
users: 1,
auxv: arch.Auxv{},
+ dumpability: UserDumpable,
aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
}
}
@@ -79,8 +80,9 @@ func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
envv: mm.envv,
auxv: append(arch.Auxv(nil), mm.auxv...),
// IncRef'd below, once we know that there isn't an error.
- executable: mm.executable,
- aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
+ executable: mm.executable,
+ dumpability: mm.dumpability,
+ aioManager: aioManager{contexts: make(map[uint64]*AIOContext)},
}
// Copy vmas.
diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go
index 9768e51f1..c218006ee 100644
--- a/pkg/sentry/mm/metadata.go
+++ b/pkg/sentry/mm/metadata.go
@@ -20,6 +20,36 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)
+// Dumpability describes if and how core dumps should be created.
+type Dumpability int
+
+const (
+ // NotDumpable indicates that core dumps should never be created.
+ NotDumpable Dumpability = iota
+
+ // UserDumpable indicates that core dumps should be created, owned by
+ // the current user.
+ UserDumpable
+
+ // RootDumpable indicates that core dumps should be created, owned by
+ // root.
+ RootDumpable
+)
+
+// Dumpability returns the dumpability.
+func (mm *MemoryManager) Dumpability() Dumpability {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ return mm.dumpability
+}
+
+// SetDumpability sets the dumpability.
+func (mm *MemoryManager) SetDumpability(d Dumpability) {
+ mm.metadataMu.Lock()
+ defer mm.metadataMu.Unlock()
+ mm.dumpability = d
+}
+
// ArgvStart returns the start of the application argument vector.
//
// There is no guarantee that this value is sensible w.r.t. ArgvEnd.
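
Taken together with the task_exec.go, task_identity.go, and ptrace.go hunks earlier in this diff, the dumpability lifecycle reduces to the following standalone sketch (stand-in types only, not the mm or kernel packages): fresh and exec'd memory managers are user-dumpable, a change of effective UID/GID drops them to not-dumpable (suid_dumpable's default), and tracing of non-user-dumpable targets is then refused.

package main

import "fmt"

type Dumpability int

const (
	NotDumpable Dumpability = iota
	UserDumpable
	RootDumpable
)

type memoryManager struct{ dumpability Dumpability }

func newMM() *memoryManager { return &memoryManager{dumpability: UserDumpable} }

func (mm *memoryManager) onExec()            { mm.dumpability = UserDumpable } // fs/exec.c:setup_new_exec
func (mm *memoryManager) onSetEffectiveIDs() { mm.dumpability = NotDumpable }  // prctl(2) reset rule

// canTrace mirrors the new dumpability check in CanTrace: an otherwise
// unprivileged tracer may not attach to a non-user-dumpable target.
func canTrace(target *memoryManager) bool {
	return target.dumpability == UserDumpable
}

func main() {
	mm := newMM()
	fmt.Println(canTrace(mm)) // true
	mm.onSetEffectiveIDs()
	fmt.Println(canTrace(mm)) // false
	mm.onExec()
	fmt.Println(canTrace(mm)) // true
}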
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index eb6defa2b..604866d04 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -219,6 +219,12 @@ type MemoryManager struct {
// executable is protected by metadataMu.
executable *fs.Dirent
+ // dumpability describes if and how this MemoryManager may be dumped to
+ // userspace.
+ //
+ // dumpability is protected by metadataMu.
+ dumpability Dumpability
+
// aioManager keeps track of AIOContexts used for async IOs. AIOManager
// must be cloned when CLONE_VM is used.
aioManager aioManager
@@ -270,6 +276,12 @@ type vma struct {
mlockMode memmap.MLockMode
+ // numaPolicy is the NUMA policy for this vma set by mbind().
+ numaPolicy int32
+
+ // numaNodemask is the NUMA nodemask for this vma set by mbind().
+ numaNodemask uint64
+
// If id is not nil, it controls the lifecycle of mappable and provides vma
// metadata shown in /proc/[pid]/maps, and the vma holds a reference.
id memmap.MappingIdentity
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index 0368c6794..9cf136532 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -470,6 +470,16 @@ func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr usermem.Addr, oldSi
return 0, syserror.EINVAL
}
+ // Check that the new region is valid.
+ _, err := mm.findAvailableLocked(newSize, findAvailableOpts{
+ Addr: newAddr,
+ Fixed: true,
+ Unmap: true,
+ })
+ if err != nil {
+ return 0, err
+ }
+
// Unmap any mappings at the destination.
mm.unmapLocked(ctx, newAR)
@@ -963,6 +973,59 @@ func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error
return nil
}
+// NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR).
+func (mm *MemoryManager) NumaPolicy(addr usermem.Addr) (int32, uint64, error) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ vseg := mm.vmas.FindSegment(addr)
+ if !vseg.Ok() {
+ return 0, 0, syserror.EFAULT
+ }
+ vma := vseg.ValuePtr()
+ return vma.numaPolicy, vma.numaNodemask, nil
+}
+
+// SetNumaPolicy implements the semantics of Linux's mbind().
+func (mm *MemoryManager) SetNumaPolicy(addr usermem.Addr, length uint64, policy int32, nodemask uint64) error {
+ if !addr.IsPageAligned() {
+ return syserror.EINVAL
+ }
+ // Linux allows this to overflow.
+ la, _ := usermem.Addr(length).RoundUp()
+ ar, ok := addr.ToRange(uint64(la))
+ if !ok {
+ return syserror.EINVAL
+ }
+ if ar.Length() == 0 {
+ return nil
+ }
+
+ mm.mappingMu.Lock()
+ defer mm.mappingMu.Unlock()
+ defer func() {
+ mm.vmas.MergeRange(ar)
+ mm.vmas.MergeAdjacent(ar)
+ }()
+ vseg := mm.vmas.LowerBoundSegment(ar.Start)
+ lastEnd := ar.Start
+ for {
+ if !vseg.Ok() || lastEnd < vseg.Start() {
+ // "EFAULT: ... there was an unmapped hole in the specified memory
+ // range specified [sic] by addr and len." - mbind(2)
+ return syserror.EFAULT
+ }
+ vseg = mm.vmas.Isolate(vseg, ar)
+ vma := vseg.ValuePtr()
+ vma.numaPolicy = policy
+ vma.numaNodemask = nodemask
+ lastEnd = vseg.End()
+ if ar.End <= lastEnd {
+ return nil
+ }
+ vseg, _ = vseg.NextNonEmpty()
+ }
+}
+
// Decommit implements the semantics of Linux's madvise(MADV_DONTNEED).
func (mm *MemoryManager) Decommit(addr usermem.Addr, length uint64) error {
ar, ok := addr.ToRange(length)
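
A standalone sketch of the hole-detection loop in SetNumaPolicy above, using a plain sorted slice instead of the sentry's vma set; the helper names are invented.

package main

import (
	"errors"
	"fmt"
)

type addrRange struct{ start, end uint64 }

var errFault = errors.New("EFAULT: unmapped hole in the requested range")

// applyPolicy applies fn to every mapped range overlapping [start, end),
// requiring the mappings to cover the request contiguously, as mbind(2)
// demands.
func applyPolicy(vmas []addrRange, start, end uint64, fn func(addrRange)) error {
	lastEnd := start
	for _, v := range vmas {
		if v.end <= start || v.start >= end {
			continue // no overlap with the request
		}
		if v.start > lastEnd {
			return errFault // hole before this vma
		}
		fn(v)
		lastEnd = v.end
		if end <= lastEnd {
			return nil
		}
	}
	return errFault // ran out of vmas before covering the range
}

func main() {
	vmas := []addrRange{{0x1000, 0x3000}, {0x3000, 0x5000}, {0x6000, 0x8000}}
	fmt.Println(applyPolicy(vmas, 0x2000, 0x4000, func(addrRange) {})) // <nil>
	fmt.Println(applyPolicy(vmas, 0x4000, 0x7000, func(addrRange) {})) // EFAULT (hole at 0x5000..0x6000)
}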
diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go
index 02203f79f..0af8de5b0 100644
--- a/pkg/sentry/mm/vma.go
+++ b/pkg/sentry/mm/vma.go
@@ -107,6 +107,7 @@ func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOp
private: opts.Private,
growsDown: opts.GrowsDown,
mlockMode: opts.MLockMode,
+ numaPolicy: linux.MPOL_DEFAULT,
id: opts.MappingIdentity,
hint: opts.Hint,
}
@@ -436,6 +437,8 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa
vma1.private != vma2.private ||
vma1.growsDown != vma2.growsDown ||
vma1.mlockMode != vma2.mlockMode ||
+ vma1.numaPolicy != vma2.numaPolicy ||
+ vma1.numaNodemask != vma2.numaNodemask ||
vma1.id != vma2.id ||
vma1.hint != vma2.hint {
return vma{}, false
diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD
index 8a8a0e4e4..ca2d5ba6f 100644
--- a/pkg/sentry/pgalloc/BUILD
+++ b/pkg/sentry/pgalloc/BUILD
@@ -63,9 +63,10 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/log",
+ "//pkg/memutil",
"//pkg/sentry/arch",
"//pkg/sentry/context",
- "//pkg/sentry/memutil",
+ "//pkg/sentry/hostmm",
"//pkg/sentry/platform",
"//pkg/sentry/safemem",
"//pkg/sentry/usage",
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 2b9924ad7..6d91f1a7b 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -32,6 +32,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/hostmm"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
@@ -162,6 +163,11 @@ type MemoryFile struct {
// evictionWG counts the number of goroutines currently performing evictions.
evictionWG sync.WaitGroup
+
+ // stopNotifyPressure stops memory cgroup pressure level
+ // notifications used to drive eviction. stopNotifyPressure is
+ // immutable.
+ stopNotifyPressure func()
}
// MemoryFileOpts provides options to NewMemoryFile.
@@ -169,6 +175,11 @@ type MemoryFileOpts struct {
// DelayedEviction controls the extent to which the MemoryFile may delay
// eviction of evictable allocations.
DelayedEviction DelayedEvictionType
+
+ // If UseHostMemcgPressure is true, use host memory cgroup pressure level
+ // notifications to determine when eviction is necessary. This option has
+ // no effect unless DelayedEviction is DelayedEvictionEnabled.
+ UseHostMemcgPressure bool
}
// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
@@ -186,9 +197,14 @@ const (
// evictable allocations until doing so is considered necessary to avoid
// performance degradation due to host memory pressure, or OOM kills.
//
- // As of this writing, DelayedEvictionEnabled delays evictions until the
- // reclaimer goroutine is out of work (pages to reclaim), then evicts all
- // pending evictable allocations immediately.
+ // As of this writing, the behavior of DelayedEvictionEnabled depends on
+ // whether or not MemoryFileOpts.UseHostMemcgPressure is enabled:
+ //
+ // - If UseHostMemcgPressure is true, evictions are delayed until memory
+ // pressure is indicated.
+ //
+ // - Otherwise, evictions are only delayed until the reclaimer goroutine
+ // is out of work (pages to reclaim).
DelayedEvictionEnabled
// DelayedEvictionManual requires that evictable allocations are only
@@ -292,6 +308,22 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
}
f.mappings.Store(make([]uintptr, initialSize/chunkSize))
f.reclaimCond.L = &f.mu
+
+ if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
+ stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
+ f.mu.Lock()
+ startedAny := f.startEvictionsLocked()
+ f.mu.Unlock()
+ if startedAny {
+ log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure")
+ }
+ }, "low")
+ if err != nil {
+ return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err)
+ }
+ f.stopNotifyPressure = stop
+ }
+
go f.runReclaim() // S/R-SAFE: f.mu
// The Linux kernel contains an optional feature called "Integrity
@@ -692,9 +724,11 @@ func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange)
// Kick off eviction immediately.
f.startEvictionGoroutineLocked(user, info)
case DelayedEvictionEnabled:
- // Ensure that the reclaimer goroutine is running, so that it can
- // start eviction when necessary.
- f.reclaimCond.Signal()
+ if !f.opts.UseHostMemcgPressure {
+ // Ensure that the reclaimer goroutine is running, so that it
+ // can start eviction when necessary.
+ f.reclaimCond.Signal()
+ }
}
}
}
@@ -992,11 +1026,12 @@ func (f *MemoryFile) runReclaim() {
}
f.markReclaimed(fr)
}
+
// We only get here if findReclaimable finds f.destroyed set and returns
// false.
f.mu.Lock()
- defer f.mu.Unlock()
if !f.destroyed {
+ f.mu.Unlock()
panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set")
}
f.file.Close()
@@ -1016,6 +1051,13 @@ func (f *MemoryFile) runReclaim() {
}
// Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.)
f.mappings.Store([]uintptr{})
+ f.mu.Unlock()
+
+ // This must be called without holding f.mu to avoid circular lock
+ // ordering.
+ if f.stopNotifyPressure != nil {
+ f.stopNotifyPressure()
+ }
}
func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
@@ -1029,7 +1071,7 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) {
if f.reclaimable {
break
}
- if f.opts.DelayedEviction == DelayedEvictionEnabled {
+ if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure {
// No work to do. Evict any pending evictable allocations to
// get more reclaimable pages before going to sleep.
f.startEvictionsLocked()
@@ -1089,14 +1131,17 @@ func (f *MemoryFile) StartEvictions() {
}
// Preconditions: f.mu must be locked.
-func (f *MemoryFile) startEvictionsLocked() {
+func (f *MemoryFile) startEvictionsLocked() bool {
+ startedAny := false
for user, info := range f.evictable {
// Don't start multiple goroutines to evict the same user's
// allocations.
if !info.evicting {
f.startEvictionGoroutineLocked(user, info)
+ startedAny = true
}
}
+ return startedAny
}
// Preconditions: info == f.evictable[user]. !info.evicting. f.mu must be
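
A hypothetical construction sketch for the new eviction option: with DelayedEvictionEnabled plus UseHostMemcgPressure, the MemoryFile registers for host memcg pressure notifications and defers eviction until pressure is reported. The standalone main and the temp-file backing are illustrative assumptions (real callers live inside the sentry), and NewMemoryFile will fail here if no cgroup-v1 memory controller is available.

package main

import (
	"log"
	"os"

	"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
)

func main() {
	// NewMemoryFile (signature per this diff) needs a host file to back
	// application memory; a deleted temp file stands in for it here.
	backing, err := os.CreateTemp("", "memory-file-")
	if err != nil {
		log.Fatal(err)
	}
	os.Remove(backing.Name()) // keep only the open descriptor

	mf, err := pgalloc.NewMemoryFile(backing, pgalloc.MemoryFileOpts{
		DelayedEviction:      pgalloc.DelayedEvictionEnabled,
		UseHostMemcgPressure: true, // no effect unless DelayedEviction is Enabled
	})
	if err != nil {
		log.Fatalf("creating MemoryFile: %v", err)
	}
	_ = mf // hand mf to the rest of the sentry as its memory file
}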
diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD
index 9999e58f4..2931d6ddc 100644
--- a/pkg/sentry/platform/kvm/BUILD
+++ b/pkg/sentry/platform/kvm/BUILD
@@ -32,10 +32,10 @@ go_library(
"//pkg/atomicbitops",
"//pkg/cpuid",
"//pkg/log",
+ "//pkg/procid",
"//pkg/sentry/arch",
"//pkg/sentry/platform",
"//pkg/sentry/platform/interrupt",
- "//pkg/sentry/platform/procid",
"//pkg/sentry/platform/ring0",
"//pkg/sentry/platform/ring0/pagetables",
"//pkg/sentry/platform/safecopy",
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f5953b96e..f8ccd86af 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -23,7 +23,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/atomicbitops"
"gvisor.googlesource.com/gvisor/pkg/log"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
+ "gvisor.googlesource.com/gvisor/pkg/procid"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ring0/pagetables"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index e9e4a0d16..434d003a3 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -20,11 +20,11 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/log",
+ "//pkg/procid",
"//pkg/seccomp",
"//pkg/sentry/arch",
"//pkg/sentry/platform",
"//pkg/sentry/platform/interrupt",
- "//pkg/sentry/platform/procid",
"//pkg/sentry/platform/safecopy",
"//pkg/sentry/usermem",
"@org_golang_x_sys//unix:go_default_library",
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 83b43057f..d7800a55e 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -21,9 +21,10 @@ import (
"sync"
"syscall"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/procid"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)
@@ -300,6 +301,18 @@ const (
killed
)
+func (t *thread) dumpAndPanic(message string) {
+ var regs syscall.PtraceRegs
+ message += "\n"
+ if err := t.getRegs(&regs); err == nil {
+ message += dumpRegs(&regs)
+ } else {
+ log.Warningf("unable to get registers: %v", err)
+ }
+ message += fmt.Sprintf("stubStart\t = %016x\n", stubStart)
+ panic(message)
+}
+
// wait waits for a stop event.
//
// Precondition: outcome is a valid waitOutcome.
@@ -320,7 +333,7 @@ func (t *thread) wait(outcome waitOutcome) syscall.Signal {
switch outcome {
case stopped:
if !status.Stopped() {
- panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
+ t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted stopped", status))
}
stopSig := status.StopSignal()
if stopSig == 0 {
@@ -334,12 +347,12 @@ func (t *thread) wait(outcome waitOutcome) syscall.Signal {
return stopSig
case killed:
if !status.Exited() && !status.Signaled() {
- panic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
+ t.dumpAndPanic(fmt.Sprintf("ptrace status unexpected: got %v, wanted exited", status))
}
return syscall.Signal(status.ExitStatus())
default:
// Should not happen.
- panic(fmt.Sprintf("unknown outcome: %v", outcome))
+ t.dumpAndPanic(fmt.Sprintf("unknown outcome: %v", outcome))
}
}
}
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index 77a0e908f..fdd21c8f8 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -17,6 +17,8 @@
package ptrace
import (
+ "fmt"
+ "strings"
"syscall"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
@@ -102,3 +104,38 @@ func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
}
return uintptr(rval), nil
}
+
+func dumpRegs(regs *syscall.PtraceRegs) string {
+ var m strings.Builder
+
+ fmt.Fprintf(&m, "Registers:\n")
+ fmt.Fprintf(&m, "\tR15\t = %016x\n", regs.R15)
+ fmt.Fprintf(&m, "\tR14\t = %016x\n", regs.R14)
+ fmt.Fprintf(&m, "\tR13\t = %016x\n", regs.R13)
+ fmt.Fprintf(&m, "\tR12\t = %016x\n", regs.R12)
+ fmt.Fprintf(&m, "\tRbp\t = %016x\n", regs.Rbp)
+ fmt.Fprintf(&m, "\tRbx\t = %016x\n", regs.Rbx)
+ fmt.Fprintf(&m, "\tR11\t = %016x\n", regs.R11)
+ fmt.Fprintf(&m, "\tR10\t = %016x\n", regs.R10)
+ fmt.Fprintf(&m, "\tR9\t = %016x\n", regs.R9)
+ fmt.Fprintf(&m, "\tR8\t = %016x\n", regs.R8)
+ fmt.Fprintf(&m, "\tRax\t = %016x\n", regs.Rax)
+ fmt.Fprintf(&m, "\tRcx\t = %016x\n", regs.Rcx)
+ fmt.Fprintf(&m, "\tRdx\t = %016x\n", regs.Rdx)
+ fmt.Fprintf(&m, "\tRsi\t = %016x\n", regs.Rsi)
+ fmt.Fprintf(&m, "\tRdi\t = %016x\n", regs.Rdi)
+ fmt.Fprintf(&m, "\tOrig_rax = %016x\n", regs.Orig_rax)
+ fmt.Fprintf(&m, "\tRip\t = %016x\n", regs.Rip)
+ fmt.Fprintf(&m, "\tCs\t = %016x\n", regs.Cs)
+ fmt.Fprintf(&m, "\tEflags\t = %016x\n", regs.Eflags)
+ fmt.Fprintf(&m, "\tRsp\t = %016x\n", regs.Rsp)
+ fmt.Fprintf(&m, "\tSs\t = %016x\n", regs.Ss)
+ fmt.Fprintf(&m, "\tFs_base\t = %016x\n", regs.Fs_base)
+ fmt.Fprintf(&m, "\tGs_base\t = %016x\n", regs.Gs_base)
+ fmt.Fprintf(&m, "\tDs\t = %016x\n", regs.Ds)
+ fmt.Fprintf(&m, "\tEs\t = %016x\n", regs.Es)
+ fmt.Fprintf(&m, "\tFs\t = %016x\n", regs.Fs)
+ fmt.Fprintf(&m, "\tGs\t = %016x\n", regs.Gs)
+
+ return m.String()
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 2c07b4ac3..914be7486 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -22,9 +22,9 @@ import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/procid"
"gvisor.googlesource.com/gvisor/pkg/seccomp"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
- "gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
)
const syscallEvent syscall.Signal = 0x80
@@ -142,7 +142,7 @@ func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, erro
// down available calls only to what is needed.
rules := []seccomp.RuleSet{
// Rules for trapping vsyscall access.
- seccomp.RuleSet{
+ {
Rules: seccomp.SyscallRules{
syscall.SYS_GETTIMEOFDAY: {},
syscall.SYS_TIME: {},
diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go
index c0238691d..434d7ca2e 100644
--- a/pkg/sentry/socket/control/control.go
+++ b/pkg/sentry/socket/control/control.go
@@ -406,12 +406,20 @@ func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials {
return nil
}
if cr, ok := socketOrEndpoint.(transport.Credentialer); ok && (cr.Passcred() || cr.ConnectedPasscred()) {
- tcred := t.Credentials()
- return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID}
+ return MakeCreds(t)
}
return nil
}
+// MakeCreds creates default SCMCredentials.
+func MakeCreds(t *kernel.Task) SCMCredentials {
+ if t == nil {
+ return nil
+ }
+ tcred := t.Credentials()
+ return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID}
+}
+
// New creates default control messages if needed.
func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages {
return transport.ControlMessages{
diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD
index 44bb97b5b..7e2679ea0 100644
--- a/pkg/sentry/socket/epsocket/BUILD
+++ b/pkg/sentry/socket/epsocket/BUILD
@@ -32,7 +32,6 @@ go_library(
"//pkg/sentry/kernel/time",
"//pkg/sentry/safemem",
"//pkg/sentry/socket",
- "//pkg/sentry/socket/unix/transport",
"//pkg/sentry/unimpl",
"//pkg/sentry/usermem",
"//pkg/syserr",
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index de4b963da..a50798cb3 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -44,7 +44,6 @@ import (
ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.googlesource.com/gvisor/pkg/sentry/unimpl"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/syserr"
@@ -52,6 +51,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
"gvisor.googlesource.com/gvisor/pkg/waiter"
)
@@ -227,7 +227,8 @@ type SocketOperations struct {
family int
Endpoint tcpip.Endpoint
- skType transport.SockType
+ skType linux.SockType
+ protocol int
// readMu protects access to the below fields.
readMu sync.Mutex `state:"nosave"`
@@ -252,8 +253,8 @@ type SocketOperations struct {
}
// New creates a new endpoint socket.
-func New(t *kernel.Task, family int, skType transport.SockType, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) {
- if skType == transport.SockStream {
+func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) {
+ if skType == linux.SOCK_STREAM {
if err := endpoint.SetSockOpt(tcpip.DelayOption(1)); err != nil {
return nil, syserr.TranslateNetstackError(err)
}
@@ -266,6 +267,7 @@ func New(t *kernel.Task, family int, skType transport.SockType, queue *waiter.Qu
family: family,
Endpoint: endpoint,
skType: skType,
+ protocol: protocol,
}), nil
}
@@ -550,7 +552,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
}
- ns, err := New(t, s.family, s.skType, wq, ep)
+ ns, err := New(t, s.family, s.skType, s.protocol, wq, ep)
if err != nil {
return 0, nil, 0, err
}
@@ -578,7 +580,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
fd, e := t.FDMap().NewFDFrom(0, ns, fdFlags, t.ThreadGroup().Limits())
- t.Kernel().RecordSocket(ns, s.family)
+ t.Kernel().RecordSocket(ns)
return fd, addr, addrLen, syserr.FromError(e)
}
@@ -637,7 +639,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (
// GetSockOpt can be used to implement the linux syscall getsockopt(2) for
// sockets backed by a commonEndpoint.
-func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
+func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) {
switch level {
case linux.SOL_SOCKET:
return getSockOptSocket(t, s, ep, family, skType, name, outLen)
@@ -663,7 +665,7 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int,
}
// getSockOptSocket implements GetSockOpt when level is SOL_SOCKET.
-func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType transport.SockType, name, outLen int) (interface{}, *syserr.Error) {
+func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) {
// TODO(b/124056281): Stop rejecting short optLen values in getsockopt.
switch name {
case linux.SO_TYPE:
@@ -918,6 +920,30 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
t.Kernel().EmitUnimplementedEvent(t)
+ case linux.TCP_CONGESTION:
+ if outLen <= 0 {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ var v tcpip.CongestionControlOption
+ if err := ep.GetSockOpt(&v); err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+
+ // We match Linux behaviour here: return the lower of
+ // TCP_CA_NAME_MAX bytes or the option length.
+ //
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const tcpCANameMax = 16
+
+ toCopy := tcpCANameMax
+ if outLen < tcpCANameMax {
+ toCopy = outLen
+ }
+ b := make([]byte, toCopy)
+ copy(b, v)
+ return b, nil
+
default:
emitUnimplementedEventTCP(t, name)
}
@@ -1220,6 +1246,12 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v))))
+ case linux.TCP_CONGESTION:
+ v := tcpip.CongestionControlOption(optVal)
+ if err := ep.SetSockOpt(v); err != nil {
+ return syserr.TranslateNetstackError(err)
+ }
+ return nil
case linux.TCP_REPAIR_OPTIONS:
t.Kernel().EmitUnimplementedEvent(t)
@@ -2281,3 +2313,51 @@ func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 {
}
return rv
}
+
+// State implements socket.Socket.State. State translates the internal state
+// returned by netstack to values defined by Linux.
+func (s *SocketOperations) State() uint32 {
+ if s.family != linux.AF_INET && s.family != linux.AF_INET6 {
+ // States not implemented for this socket's family.
+ return 0
+ }
+
+ if !s.isPacketBased() {
+ // TCP socket.
+ switch tcp.EndpointState(s.Endpoint.State()) {
+ case tcp.StateEstablished:
+ return linux.TCP_ESTABLISHED
+ case tcp.StateSynSent:
+ return linux.TCP_SYN_SENT
+ case tcp.StateSynRecv:
+ return linux.TCP_SYN_RECV
+ case tcp.StateFinWait1:
+ return linux.TCP_FIN_WAIT1
+ case tcp.StateFinWait2:
+ return linux.TCP_FIN_WAIT2
+ case tcp.StateTimeWait:
+ return linux.TCP_TIME_WAIT
+ case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError:
+ return linux.TCP_CLOSE
+ case tcp.StateCloseWait:
+ return linux.TCP_CLOSE_WAIT
+ case tcp.StateLastAck:
+ return linux.TCP_LAST_ACK
+ case tcp.StateListen:
+ return linux.TCP_LISTEN
+ case tcp.StateClosing:
+ return linux.TCP_CLOSING
+ default:
+ // Internal or unknown state.
+ return 0
+ }
+ }
+
+ // TODO(b/112063468): Export states for UDP, ICMP, and raw sockets.
+ return 0
+}
+
+// Type implements socket.Socket.Type.
+func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) {
+ return s.family, s.skType, s.protocol
+}
diff --git a/pkg/sentry/socket/epsocket/provider.go b/pkg/sentry/socket/epsocket/provider.go
index ec930d8d5..516582828 100644
--- a/pkg/sentry/socket/epsocket/provider.go
+++ b/pkg/sentry/socket/epsocket/provider.go
@@ -23,7 +23,6 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.googlesource.com/gvisor/pkg/syserr"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
@@ -42,7 +41,7 @@ type provider struct {
// getTransportProtocol figures out transport protocol. Currently only TCP,
// UDP, and ICMP are supported.
-func getTransportProtocol(ctx context.Context, stype transport.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) {
+func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol int) (tcpip.TransportProtocolNumber, *syserr.Error) {
switch stype {
case linux.SOCK_STREAM:
if protocol != 0 && protocol != syscall.IPPROTO_TCP {
@@ -80,7 +79,7 @@ func getTransportProtocol(ctx context.Context, stype transport.SockType, protoco
}
// Socket creates a new socket object for the AF_INET or AF_INET6 family.
-func (p *provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) {
+func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
// Fail right away if we don't have a stack.
stack := t.NetworkContext()
if stack == nil {
@@ -112,11 +111,11 @@ func (p *provider) Socket(t *kernel.Task, stype transport.SockType, protocol int
return nil, syserr.TranslateNetstackError(e)
}
- return New(t, p.family, stype, wq, ep)
+ return New(t, p.family, stype, protocol, wq, ep)
}
// Pair just returns nil sockets (not supported).
-func (*provider) Pair(*kernel.Task, transport.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
+func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
return nil, nil, nil
}
diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD
index a469af7ac..975f47bc3 100644
--- a/pkg/sentry/socket/hostinet/BUILD
+++ b/pkg/sentry/socket/hostinet/BUILD
@@ -30,7 +30,6 @@ go_library(
"//pkg/sentry/kernel/time",
"//pkg/sentry/safemem",
"//pkg/sentry/socket",
- "//pkg/sentry/socket/unix/transport",
"//pkg/sentry/usermem",
"//pkg/syserr",
"//pkg/syserror",
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 41f9693bb..c62c8d8f1 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -19,7 +19,9 @@ import (
"syscall"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/binary"
"gvisor.googlesource.com/gvisor/pkg/fdnotifier"
+ "gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
@@ -28,7 +30,6 @@ import (
ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/syserr"
"gvisor.googlesource.com/gvisor/pkg/syserror"
@@ -55,15 +56,22 @@ type socketOperations struct {
fsutil.FileUseInodeUnstableAttr `state:"nosave"`
socket.SendReceiveTimeout
- family int // Read-only.
- fd int // must be O_NONBLOCK
- queue waiter.Queue
+ family int // Read-only.
+ stype linux.SockType // Read-only.
+ protocol int // Read-only.
+ fd int // must be O_NONBLOCK
+ queue waiter.Queue
}
var _ = socket.Socket(&socketOperations{})
-func newSocketFile(ctx context.Context, family int, fd int, nonblock bool) (*fs.File, *syserr.Error) {
- s := &socketOperations{family: family, fd: fd}
+func newSocketFile(ctx context.Context, family int, stype linux.SockType, protocol int, fd int, nonblock bool) (*fs.File, *syserr.Error) {
+ s := &socketOperations{
+ family: family,
+ stype: stype,
+ protocol: protocol,
+ fd: fd,
+ }
if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil {
return nil, syserr.FromError(err)
}
@@ -221,7 +229,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr)
}
- f, err := newSocketFile(t, s.family, fd, flags&syscall.SOCK_NONBLOCK != 0)
+ f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&syscall.SOCK_NONBLOCK != 0)
if err != nil {
syscall.Close(fd)
return 0, nil, 0, err
@@ -232,7 +240,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0,
}
kfd, kerr := t.FDMap().NewFDFrom(0, f, fdFlags, t.ThreadGroup().Limits())
- t.Kernel().RecordSocket(f, s.family)
+ t.Kernel().RecordSocket(f)
return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr)
}
@@ -519,12 +527,39 @@ func translateIOSyscallError(err error) error {
return err
}
+// State implements socket.Socket.State.
+func (s *socketOperations) State() uint32 {
+ info := linux.TCPInfo{}
+ buf, err := getsockopt(s.fd, syscall.SOL_TCP, syscall.TCP_INFO, linux.SizeOfTCPInfo)
+ if err != nil {
+ if err != syscall.ENOPROTOOPT {
+ log.Warningf("Failed to get TCP socket info from %+v: %v", s, err)
+ }
+ // For non-TCP sockets, silently ignore the failure.
+ return 0
+ }
+ if len(buf) != linux.SizeOfTCPInfo {
+ // Unmarshal below will panic if getsockopt returns a buffer of
+ // unexpected size.
+ log.Warningf("Failed to get TCP socket info from %+v: getsockopt(2) returned %d bytes, expecting %d bytes.", s, len(buf), linux.SizeOfTCPInfo)
+ return 0
+ }
+
+ binary.Unmarshal(buf, usermem.ByteOrder, &info)
+ return uint32(info.State)
+}
+
+// Type implements socket.Socket.Type.
+func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) {
+ return s.family, s.stype, s.protocol
+}
+
type socketProvider struct {
family int
}
// Socket implements socket.Provider.Socket.
-func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, protocol int) (*fs.File, *syserr.Error) {
+func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) {
// Check that we are using the host network stack.
stack := t.NetworkContext()
if stack == nil {
@@ -535,7 +570,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p
}
// Only accept TCP and UDP.
- stype := int(stypeflags) & linux.SOCK_TYPE_MASK
+ stype := stypeflags & linux.SOCK_TYPE_MASK
switch stype {
case syscall.SOCK_STREAM:
switch protocol {
@@ -558,15 +593,15 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p
// Conservatively ignore all flags specified by the application and add
// SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0
// to simplify the syscall filters, since 0 and IPPROTO_* are equivalent.
- fd, err := syscall.Socket(p.family, stype|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
+ fd, err := syscall.Socket(p.family, int(stype)|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0)
if err != nil {
return nil, syserr.FromError(err)
}
- return newSocketFile(t, p.family, fd, stypeflags&syscall.SOCK_NONBLOCK != 0)
+ return newSocketFile(t, p.family, stype, protocol, fd, stypeflags&syscall.SOCK_NONBLOCK != 0)
}
// Pair implements socket.Provider.Pair.
-func (p *socketProvider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
+func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
// Not supported by AF_INET/AF_INET6.
return nil, nil, nil
}
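The hostinet State implementation depends on validating the getsockopt(TCP_INFO) buffer length before unmarshalling. Since Linux's struct tcp_info begins with the one-byte tcpi_state field, the state of a valid buffer is simply its first byte. A simplified, self-contained sketch of that check (the size constant is an assumption for the sketch; the real code uses linux.SizeOfTCPInfo):

package main

import "fmt"

const sizeOfTCPInfo = 104 // assumed struct tcp_info size for this sketch

// tcpStateFromBuf returns the TCP state from a raw TCP_INFO buffer, or false
// when the buffer is not the expected size (e.g. a truncated getsockopt result).
func tcpStateFromBuf(buf []byte) (uint32, bool) {
	if len(buf) != sizeOfTCPInfo {
		return 0, false
	}
	return uint32(buf[0]), true // tcpi_state is the first field of struct tcp_info
}

func main() {
	buf := make([]byte, sizeOfTCPInfo)
	buf[0] = 1 // TCP_ESTABLISHED
	fmt.Println(tcpStateFromBuf(buf))
}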
diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go
index 76cf12fd4..5dc103877 100644
--- a/pkg/sentry/socket/netlink/provider.go
+++ b/pkg/sentry/socket/netlink/provider.go
@@ -22,7 +22,6 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.googlesource.com/gvisor/pkg/syserr"
)
@@ -66,10 +65,10 @@ type socketProvider struct {
}
// Socket implements socket.Provider.Socket.
-func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) {
+func (*socketProvider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
// Netlink sockets must be specified as datagram or raw, but they
// behave the same regardless of type.
- if stype != transport.SockDgram && stype != transport.SockRaw {
+ if stype != linux.SOCK_DGRAM && stype != linux.SOCK_RAW {
return nil, syserr.ErrSocketNotSupported
}
@@ -83,7 +82,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol
return nil, err
}
- s, err := NewSocket(t, p)
+ s, err := NewSocket(t, stype, p)
if err != nil {
return nil, err
}
@@ -94,7 +93,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype transport.SockType, protocol
}
// Pair implements socket.Provider.Pair by returning an error.
-func (*socketProvider) Pair(*kernel.Task, transport.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
+func (*socketProvider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
// Netlink sockets never support creating socket pairs.
return nil, nil, syserr.ErrNotSupported
}
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index afd06ca33..62659784a 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -80,6 +80,10 @@ type Socket struct {
// protocol is the netlink protocol implementation.
protocol Protocol
+ // skType is the socket type. This is either SOCK_DGRAM or SOCK_RAW for
+ // netlink sockets.
+ skType linux.SockType
+
// ep is a datagram unix endpoint used to buffer messages sent from the
// kernel to userspace. RecvMsg reads messages from this endpoint.
ep transport.Endpoint
@@ -105,7 +109,7 @@ type Socket struct {
var _ socket.Socket = (*Socket)(nil)
// NewSocket creates a new Socket.
-func NewSocket(t *kernel.Task, protocol Protocol) (*Socket, *syserr.Error) {
+func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socket, *syserr.Error) {
// Datagram endpoint used to buffer kernel -> user messages.
ep := transport.NewConnectionless()
@@ -126,6 +130,7 @@ func NewSocket(t *kernel.Task, protocol Protocol) (*Socket, *syserr.Error) {
return &Socket{
ports: t.Kernel().NetlinkPorts(),
protocol: protocol,
+ skType: skType,
ep: ep,
connection: connection,
sendBufferSize: defaultSendBufferSize,
@@ -616,3 +621,13 @@ func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence,
n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{})
return int64(n), err.ToError()
}
+
+// State implements socket.Socket.State.
+func (s *Socket) State() uint32 {
+ return s.ep.State()
+}
+
+// Type implements socket.Socket.Type.
+func (s *Socket) Type() (family int, skType linux.SockType, protocol int) {
+ return linux.AF_NETLINK, s.skType, s.protocol.Protocol()
+}
diff --git a/pkg/sentry/socket/rpcinet/BUILD b/pkg/sentry/socket/rpcinet/BUILD
index 4da14a1e0..33ba20de7 100644
--- a/pkg/sentry/socket/rpcinet/BUILD
+++ b/pkg/sentry/socket/rpcinet/BUILD
@@ -31,7 +31,6 @@ go_library(
"//pkg/sentry/socket/hostinet",
"//pkg/sentry/socket/rpcinet/conn",
"//pkg/sentry/socket/rpcinet/notifier",
- "//pkg/sentry/socket/unix/transport",
"//pkg/sentry/unimpl",
"//pkg/sentry/usermem",
"//pkg/syserr",
diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go
index 55e0b6665..c22ff1ff0 100644
--- a/pkg/sentry/socket/rpcinet/socket.go
+++ b/pkg/sentry/socket/rpcinet/socket.go
@@ -32,7 +32,6 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/conn"
"gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/notifier"
pb "gvisor.googlesource.com/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto"
- "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.googlesource.com/gvisor/pkg/sentry/unimpl"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/syserr"
@@ -54,7 +53,10 @@ type socketOperations struct {
fsutil.FileUseInodeUnstableAttr `state:"nosave"`
socket.SendReceiveTimeout
- family int // Read-only.
+ family int // Read-only.
+ stype linux.SockType // Read-only.
+ protocol int // Read-only.
+
fd uint32 // must be O_NONBLOCK
wq *waiter.Queue
rpcConn *conn.RPCConnection
@@ -70,7 +72,7 @@ type socketOperations struct {
var _ = socket.Socket(&socketOperations{})
// New creates a new RPC socket.
-func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, protocol int) (*fs.File, *syserr.Error) {
+func newSocketFile(ctx context.Context, stack *Stack, family int, skType linux.SockType, protocol int) (*fs.File, *syserr.Error) {
id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_Socket{&pb.SocketRequest{Family: int64(family), Type: int64(skType | syscall.SOCK_NONBLOCK), Protocol: int64(protocol)}}}, false /* ignoreResult */)
<-c
@@ -87,6 +89,8 @@ func newSocketFile(ctx context.Context, stack *Stack, family int, skType int, pr
defer dirent.DecRef()
return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &socketOperations{
family: family,
+ stype: skType,
+ protocol: protocol,
wq: &wq,
fd: fd,
rpcConn: stack.rpcConn,
@@ -333,7 +337,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
if err != nil {
return 0, nil, 0, syserr.FromError(err)
}
- t.Kernel().RecordSocket(file, s.family)
+ t.Kernel().RecordSocket(file)
if peerRequested {
return fd, payload.Address.Address, payload.Address.Length, nil
@@ -830,12 +834,23 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
}
}
+// State implements socket.Socket.State.
+func (s *socketOperations) State() uint32 {
+ // TODO(b/127845868): Define a new rpc to query the socket state.
+ return 0
+}
+
+// Type implements socket.Socket.Type.
+func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) {
+ return s.family, s.stype, s.protocol
+}
+
type socketProvider struct {
family int
}
// Socket implements socket.Provider.Socket.
-func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, protocol int) (*fs.File, *syserr.Error) {
+func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) {
// Check that we are using the RPC network stack.
stack := t.NetworkContext()
if stack == nil {
@@ -851,7 +866,7 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p
//
// Try to restrict the flags we will accept to minimize backwards
// incompatibility with netstack.
- stype := int(stypeflags) & linux.SOCK_TYPE_MASK
+ stype := stypeflags & linux.SOCK_TYPE_MASK
switch stype {
case syscall.SOCK_STREAM:
switch protocol {
@@ -871,11 +886,11 @@ func (p *socketProvider) Socket(t *kernel.Task, stypeflags transport.SockType, p
return nil, nil
}
- return newSocketFile(t, s, p.family, stype, 0)
+ return newSocketFile(t, s, p.family, stype, protocol)
}
// Pair implements socket.Provider.Pair.
-func (p *socketProvider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
+func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
// Not supported by AF_INET/AF_INET6.
return nil, nil, nil
}
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 9393acd28..d60944b6b 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -116,6 +116,13 @@ type Socket interface {
// SendTimeout gets the current timeout (in ns) for send operations. Zero
// means no timeout, and negative means DONTWAIT.
SendTimeout() int64
+
+ // State returns the current state of the socket, as represented by Linux in
+ // procfs. The returned state value is protocol-specific.
+ State() uint32
+
+ // Type returns the family, socket type and protocol of the socket.
+ Type() (family int, skType linux.SockType, protocol int)
}
// Provider is the interface implemented by providers of sockets for specific
@@ -126,12 +133,12 @@ type Provider interface {
// If a nil Socket _and_ a nil error is returned, it means that the
// protocol is not supported. A non-nil error should only be returned
// if the protocol is supported, but an error occurs during creation.
- Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error)
+ Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error)
// Pair creates a pair of connected sockets.
//
// See Socket for error information.
- Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error)
+ Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error)
}
// families holds a map of all known address families and their providers.
@@ -145,14 +152,14 @@ func RegisterProvider(family int, provider Provider) {
}
// New creates a new socket with the given family, type and protocol.
-func New(t *kernel.Task, family int, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) {
+func New(t *kernel.Task, family int, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
for _, p := range families[family] {
s, err := p.Socket(t, stype, protocol)
if err != nil {
return nil, err
}
if s != nil {
- t.Kernel().RecordSocket(s, family)
+ t.Kernel().RecordSocket(s)
return s, nil
}
}
@@ -162,7 +169,7 @@ func New(t *kernel.Task, family int, stype transport.SockType, protocol int) (*f
// Pair creates a new connected socket pair with the given family, type and
// protocol.
-func Pair(t *kernel.Task, family int, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
+func Pair(t *kernel.Task, family int, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
providers, ok := families[family]
if !ok {
return nil, nil, syserr.ErrAddressFamilyNotSupported
@@ -175,8 +182,8 @@ func Pair(t *kernel.Task, family int, stype transport.SockType, protocol int) (*
}
if s1 != nil && s2 != nil {
k := t.Kernel()
- k.RecordSocket(s1, family)
- k.RecordSocket(s2, family)
+ k.RecordSocket(s1)
+ k.RecordSocket(s2)
return s1, s2, nil
}
}
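State and Type were added to socket.Socket so that code holding recorded sockets (ultimately the procfs net files) can classify them without knowing the provider-specific type. A hypothetical consumer, filtering for established IPv4 TCP sockets:

import (
	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/sentry/socket"
)

// establishedTCP is a sketch, not sentry code: it keeps only AF_INET stream
// sockets whose reported state is TCP_ESTABLISHED.
func establishedTCP(socks []socket.Socket) []socket.Socket {
	var out []socket.Socket
	for _, s := range socks {
		family, skType, _ := s.Type()
		if family != linux.AF_INET || skType != linux.SOCK_STREAM {
			continue
		}
		if s.State() == linux.TCP_ESTABLISHED {
			out = append(out, s)
		}
	}
	return out
}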
diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD
index 5a2de0c4c..52f324eed 100644
--- a/pkg/sentry/socket/unix/transport/BUILD
+++ b/pkg/sentry/socket/unix/transport/BUILD
@@ -28,6 +28,7 @@ go_library(
importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix/transport",
visibility = ["//:sandbox"],
deps = [
+ "//pkg/abi/linux",
"//pkg/ilist",
"//pkg/refs",
"//pkg/syserr",
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index 18e492862..db79ac904 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -17,6 +17,7 @@ package transport
import (
"sync"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/syserr"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/waiter"
@@ -44,7 +45,7 @@ type ConnectingEndpoint interface {
// Type returns the socket type, typically either SockStream or
// SockSeqpacket. The connection attempt must be aborted if this
// value doesn't match the ConnectableEndpoint's type.
- Type() SockType
+ Type() linux.SockType
// GetLocalAddress returns the bound path.
GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
@@ -100,7 +101,7 @@ type connectionedEndpoint struct {
// stype is used by connecting sockets to ensure that they are the
// same type. The value is typically either tcpip.SockSeqpacket or
// tcpip.SockStream.
- stype SockType
+ stype linux.SockType
// acceptedChan is per the TCP endpoint implementation. Note that the
// sockets in this channel are _already in the connected state_, and
@@ -111,7 +112,7 @@ type connectionedEndpoint struct {
}
// NewConnectioned creates a new unbound connectionedEndpoint.
-func NewConnectioned(stype SockType, uid UniqueIDProvider) Endpoint {
+func NewConnectioned(stype linux.SockType, uid UniqueIDProvider) Endpoint {
return &connectionedEndpoint{
baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}},
id: uid.UniqueID(),
@@ -121,7 +122,7 @@ func NewConnectioned(stype SockType, uid UniqueIDProvider) Endpoint {
}
// NewPair allocates a new pair of connected unix-domain connectionedEndpoints.
-func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) {
+func NewPair(stype linux.SockType, uid UniqueIDProvider) (Endpoint, Endpoint) {
a := &connectionedEndpoint{
baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}},
id: uid.UniqueID(),
@@ -138,7 +139,7 @@ func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) {
q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit}
q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit}
- if stype == SockStream {
+ if stype == linux.SOCK_STREAM {
a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}}
b.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q2}}
} else {
@@ -162,7 +163,7 @@ func NewPair(stype SockType, uid UniqueIDProvider) (Endpoint, Endpoint) {
// NewExternal creates a new externally backed Endpoint. It behaves like a
// socketpair.
-func NewExternal(stype SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint {
+func NewExternal(stype linux.SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint {
return &connectionedEndpoint{
baseEndpoint: baseEndpoint{Queue: queue, receiver: receiver, connected: connected},
id: uid.UniqueID(),
@@ -177,7 +178,7 @@ func (e *connectionedEndpoint) ID() uint64 {
}
// Type implements ConnectingEndpoint.Type and Endpoint.Type.
-func (e *connectionedEndpoint) Type() SockType {
+func (e *connectionedEndpoint) Type() linux.SockType {
return e.stype
}
@@ -293,7 +294,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur
}
writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit}
- if e.stype == SockStream {
+ if e.stype == linux.SOCK_STREAM {
ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}}
} else {
ne.receiver = &queueReceiver{readQueue: writeQueue}
@@ -308,7 +309,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ce ConnectingEndpoint, retur
writeQueue: writeQueue,
}
readQueue.IncRef()
- if e.stype == SockStream {
+ if e.stype == linux.SOCK_STREAM {
returnConnect(&streamQueueReceiver{queueReceiver: queueReceiver{readQueue: readQueue}}, connected)
} else {
returnConnect(&queueReceiver{readQueue: readQueue}, connected)
@@ -428,7 +429,7 @@ func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *syser
func (e *connectionedEndpoint) SendMsg(data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) {
// Stream sockets do not support specifying the endpoint. Seqpacket
// sockets ignore the passed endpoint.
- if e.stype == SockStream && to != nil {
+ if e.stype == linux.SOCK_STREAM && to != nil {
return 0, syserr.ErrNotSupported
}
return e.baseEndpoint.SendMsg(data, c, to)
@@ -458,3 +459,11 @@ func (e *connectionedEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask
return ready
}
+
+// State implements socket.Socket.State.
+func (e *connectionedEndpoint) State() uint32 {
+ if e.Connected() {
+ return linux.SS_CONNECTED
+ }
+ return linux.SS_UNCONNECTED
+}
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index 43ff875e4..81ebfba10 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -15,6 +15,7 @@
package transport
import (
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/syserr"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/waiter"
@@ -118,8 +119,8 @@ func (e *connectionlessEndpoint) SendMsg(data [][]byte, c ControlMessages, to Bo
}
// Type implements Endpoint.Type.
-func (e *connectionlessEndpoint) Type() SockType {
- return SockDgram
+func (e *connectionlessEndpoint) Type() linux.SockType {
+ return linux.SOCK_DGRAM
}
// Connect attempts to connect directly to server.
@@ -194,3 +195,18 @@ func (e *connectionlessEndpoint) Readiness(mask waiter.EventMask) waiter.EventMa
return ready
}
+
+// State implements socket.Socket.State.
+func (e *connectionlessEndpoint) State() uint32 {
+ e.Lock()
+ defer e.Unlock()
+
+ switch {
+ case e.isBound():
+ return linux.SS_UNCONNECTED
+ case e.Connected():
+ return linux.SS_CONNECTING
+ default:
+ return linux.SS_DISCONNECTING
+ }
+}
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index b734b4c20..5c55c529e 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -19,6 +19,7 @@ import (
"sync"
"sync/atomic"
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/syserr"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
@@ -28,21 +29,6 @@ import (
// initialLimit is the starting limit for the socket buffers.
const initialLimit = 16 * 1024
-// A SockType is a type (as opposed to family) of sockets. These are enumerated
-// in the syscall package as syscall.SOCK_* constants.
-type SockType int
-
-const (
- // SockStream corresponds to syscall.SOCK_STREAM.
- SockStream SockType = 1
- // SockDgram corresponds to syscall.SOCK_DGRAM.
- SockDgram SockType = 2
- // SockRaw corresponds to syscall.SOCK_RAW.
- SockRaw SockType = 3
- // SockSeqpacket corresponds to syscall.SOCK_SEQPACKET.
- SockSeqpacket SockType = 5
-)
-
// A RightsControlMessage is a control message containing FDs.
type RightsControlMessage interface {
// Clone returns a copy of the RightsControlMessage.
@@ -175,7 +161,7 @@ type Endpoint interface {
// Type returns the socket type, typically either SockStream, SockDgram
// or SockSeqpacket.
- Type() SockType
+ Type() linux.SockType
// GetLocalAddress returns the address to which the endpoint is bound.
GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
@@ -191,6 +177,10 @@ type Endpoint interface {
// GetSockOpt gets a socket option. opt should be a pointer to one of the
// tcpip.*Option types.
GetSockOpt(opt interface{}) *tcpip.Error
+
+ // State returns the current state of the socket, as represented by Linux in
+ // procfs.
+ State() uint32
}
// A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket
@@ -237,6 +227,10 @@ type BoundEndpoint interface {
// endpoint.
UnidirectionalConnect() (ConnectedEndpoint, *syserr.Error)
+ // Passcred returns whether or not the SO_PASSCRED socket option is
+ // enabled on this end.
+ Passcred() bool
+
// Release releases any resources held by the BoundEndpoint. It must be
// called before dropping all references to a BoundEndpoint returned by a
// function.
@@ -621,7 +615,7 @@ type connectedEndpoint struct {
GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
// Type implements Endpoint.Type.
- Type() SockType
+ Type() linux.SockType
}
writeQueue *queue
@@ -645,7 +639,7 @@ func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages,
}
truncate := false
- if e.endpoint.Type() == SockStream {
+ if e.endpoint.Type() == linux.SOCK_STREAM {
// Since stream sockets don't preserve message boundaries, we
// can write only as much of the message as fits in the queue.
truncate = true
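The deleted transport.SockType constants (SockStream=1, SockDgram=2, SockRaw=3, SockSeqpacket=5) already carried the same numeric values as the corresponding linux.SockType constants, which is why the substitution needs no value translation. An illustrative mapping, assuming the usual Linux values:

import "gvisor.googlesource.com/gvisor/pkg/abi/linux"

// sockTypeMapping records the old transport constants and the linux.SockType
// values they become; it exists only to make the equivalence explicit.
var sockTypeMapping = map[int]linux.SockType{
	1: linux.SOCK_STREAM,    // transport.SockStream
	2: linux.SOCK_DGRAM,     // transport.SockDgram
	3: linux.SOCK_RAW,       // transport.SockRaw
	5: linux.SOCK_SEQPACKET, // transport.SockSeqpacket
}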
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index 1414be0c6..b07e8d67b 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -17,6 +17,7 @@
package unix
import (
+ "fmt"
"strings"
"syscall"
@@ -55,22 +56,22 @@ type SocketOperations struct {
refs.AtomicRefCount
socket.SendReceiveTimeout
- ep transport.Endpoint
- isPacket bool
+ ep transport.Endpoint
+ stype linux.SockType
}
// New creates a new unix socket.
-func New(ctx context.Context, endpoint transport.Endpoint, isPacket bool) *fs.File {
+func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) *fs.File {
dirent := socket.NewDirent(ctx, unixSocketDevice)
defer dirent.DecRef()
- return NewWithDirent(ctx, dirent, endpoint, isPacket, fs.FileFlags{Read: true, Write: true})
+ return NewWithDirent(ctx, dirent, endpoint, stype, fs.FileFlags{Read: true, Write: true})
}
// NewWithDirent creates a new unix socket using an existing dirent.
-func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, isPacket bool, flags fs.FileFlags) *fs.File {
+func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, stype linux.SockType, flags fs.FileFlags) *fs.File {
return fs.NewFile(ctx, d, flags, &SocketOperations{
- ep: ep,
- isPacket: isPacket,
+ ep: ep,
+ stype: stype,
})
}
@@ -88,6 +89,18 @@ func (s *SocketOperations) Release() {
s.DecRef()
}
+func (s *SocketOperations) isPacket() bool {
+ switch s.stype {
+ case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
+ return true
+ case linux.SOCK_STREAM:
+ return false
+ default:
+ // We shouldn't have allowed any other socket types during creation.
+ panic(fmt.Sprintf("Invalid socket type %d", s.stype))
+ }
+}
+
// Endpoint extracts the transport.Endpoint.
func (s *SocketOperations) Endpoint() transport.Endpoint {
return s.ep
@@ -193,7 +206,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
}
- ns := New(t, ep, s.isPacket)
+ ns := New(t, ep, s.stype)
defer ns.DecRef()
if flags&linux.SOCK_NONBLOCK != 0 {
@@ -221,7 +234,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
return 0, nil, 0, syserr.FromError(e)
}
- t.Kernel().RecordSocket(ns, linux.AF_UNIX)
+ t.Kernel().RecordSocket(ns)
return fd, addr, addrLen, nil
}
@@ -385,6 +398,10 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
}
defer ep.Release()
w.To = ep
+
+ if ep.Passcred() && w.Control.Credentials == nil {
+ w.Control.Credentials = control.MakeCreds(t)
+ }
}
n, err := src.CopyInTo(t, &w)
@@ -483,6 +500,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
peek := flags&linux.MSG_PEEK != 0
dontWait := flags&linux.MSG_DONTWAIT != 0
waitAll := flags&linux.MSG_WAITALL != 0
+ isPacket := s.isPacket()
// Calculate the number of FDs for which we have space and if we are
// requesting credentials.
@@ -516,7 +534,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || dontWait {
var from interface{}
var fromLen uint32
- if r.From != nil {
+ if r.From != nil && len([]byte(r.From.Addr)) != 0 {
from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From)
}
@@ -524,8 +542,8 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
msgFlags |= linux.MSG_CTRUNC
}
- if err != nil || dontWait || !waitAll || s.isPacket || n >= dst.NumBytes() {
- if s.isPacket && n < int64(r.MsgSize) {
+ if err != nil || dontWait || !waitAll || isPacket || n >= dst.NumBytes() {
+ if isPacket && n < int64(r.MsgSize) {
msgFlags |= linux.MSG_TRUNC
}
@@ -566,11 +584,11 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
total += n
}
- if err != nil || !waitAll || s.isPacket || n >= dst.NumBytes() {
+ if err != nil || !waitAll || isPacket || n >= dst.NumBytes() {
if total > 0 {
err = nil
}
- if s.isPacket && n < int64(r.MsgSize) {
+ if isPacket && n < int64(r.MsgSize) {
msgFlags |= linux.MSG_TRUNC
}
return int(total), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err)
@@ -592,11 +610,22 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
}
}
+// State implements socket.Socket.State.
+func (s *SocketOperations) State() uint32 {
+ return s.ep.State()
+}
+
+// Type implements socket.Socket.Type.
+func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) {
+ // Unix domain sockets always have a protocol of 0.
+ return linux.AF_UNIX, s.stype, 0
+}
+
// provider is a unix domain socket provider.
type provider struct{}
// Socket returns a new unix domain socket.
-func (*provider) Socket(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *syserr.Error) {
+func (*provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
// Check arguments.
if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
return nil, syserr.ErrProtocolNotSupported
@@ -604,43 +633,36 @@ func (*provider) Socket(t *kernel.Task, stype transport.SockType, protocol int)
// Create the endpoint and socket.
var ep transport.Endpoint
- var isPacket bool
switch stype {
case linux.SOCK_DGRAM:
- isPacket = true
ep = transport.NewConnectionless()
- case linux.SOCK_SEQPACKET:
- isPacket = true
- fallthrough
- case linux.SOCK_STREAM:
+ case linux.SOCK_SEQPACKET, linux.SOCK_STREAM:
ep = transport.NewConnectioned(stype, t.Kernel())
default:
return nil, syserr.ErrInvalidArgument
}
- return New(t, ep, isPacket), nil
+ return New(t, ep, stype), nil
}
// Pair creates a new pair of AF_UNIX connected sockets.
-func (*provider) Pair(t *kernel.Task, stype transport.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
+func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
// Check arguments.
if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
return nil, nil, syserr.ErrProtocolNotSupported
}
- var isPacket bool
switch stype {
- case linux.SOCK_STREAM:
- case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
- isPacket = true
+ case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
+ // Ok
default:
return nil, nil, syserr.ErrInvalidArgument
}
// Create the endpoints and sockets.
ep1, ep2 := transport.NewPair(stype, t.Kernel())
- s1 := New(t, ep1, isPacket)
- s2 := New(t, ep2, isPacket)
+ s1 := New(t, ep1, stype)
+ s2 := New(t, ep2, stype)
return s1, s2, nil
}
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index dbe53b9a2..0b5ef84c4 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -76,13 +76,13 @@ var SocketFamily = abi.ValueSet{
// SocketType are the possible socket(2) types.
var SocketType = abi.ValueSet{
- linux.SOCK_STREAM: "SOCK_STREAM",
- linux.SOCK_DGRAM: "SOCK_DGRAM",
- linux.SOCK_RAW: "SOCK_RAW",
- linux.SOCK_RDM: "SOCK_RDM",
- linux.SOCK_SEQPACKET: "SOCK_SEQPACKET",
- linux.SOCK_DCCP: "SOCK_DCCP",
- linux.SOCK_PACKET: "SOCK_PACKET",
+ uint64(linux.SOCK_STREAM): "SOCK_STREAM",
+ uint64(linux.SOCK_DGRAM): "SOCK_DGRAM",
+ uint64(linux.SOCK_RAW): "SOCK_RAW",
+ uint64(linux.SOCK_RDM): "SOCK_RDM",
+ uint64(linux.SOCK_SEQPACKET): "SOCK_SEQPACKET",
+ uint64(linux.SOCK_DCCP): "SOCK_DCCP",
+ uint64(linux.SOCK_PACKET): "SOCK_PACKET",
}
// SocketFlagSet are the possible socket(2) flags.
diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD
index f76989ae2..1c057526b 100644
--- a/pkg/sentry/syscalls/linux/BUILD
+++ b/pkg/sentry/syscalls/linux/BUILD
@@ -19,6 +19,7 @@ go_library(
"sys_identity.go",
"sys_inotify.go",
"sys_lseek.go",
+ "sys_mempolicy.go",
"sys_mmap.go",
"sys_mount.go",
"sys_pipe.go",
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index 1ba3695fb..72146ea63 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -92,6 +92,10 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
// TODO(gvisor.dev/issue/161): In some cases SIGPIPE should
// also be sent to the application.
return nil
+ case syserror.ECONNRESET:
+ // For TCP sendfile connections we may get a reset; in that case,
+ // just return n as the result.
+ return nil
case syserror.ErrWouldBlock:
// Syscall would block, but completed a partial read/write.
// This case should only be returned by IssueIO for nonblocking
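The new ECONNRESET case follows the same rule as the EPIPE case above it: when part of the I/O already completed, the syscall layer reports the partial byte count and swallows the error. The general pattern as a standalone sketch (names hypothetical, not the handleIOError signature):

package main

import (
	"fmt"
	"syscall"
)

// squashPartialError reports the partial count instead of the error when some
// bytes were transferred before the connection broke.
func squashPartialError(n int64, err error) (int64, error) {
	if n > 0 && (err == syscall.EPIPE || err == syscall.ECONNRESET) {
		return n, nil
	}
	return n, err
}

func main() {
	fmt.Println(squashPartialError(4096, syscall.ECONNRESET)) // 4096 <nil>
	fmt.Println(squashPartialError(0, syscall.ECONNRESET))    // 0 connection reset by peer
}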
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 3e4d312af..5251c2463 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -34,33 +34,6 @@ const _AUDIT_ARCH_X86_64 = 0xc000003e
// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall
// numbers from Linux 4.4. The entries commented out are those syscalls we
// don't currently support.
-//
-// Syscall support is documented as annotations in Go comments of the form:
-// @Syscall(<name>, <key:value>, ...)
-//
-// Supported args and values are:
-//
-// - arg: A syscall option. This entry only applies to the syscall when given
-// this option.
-// - support: Indicates support level
-// - UNIMPLEMENTED: Unimplemented (default, implies returns:ENOSYS)
-// - PARTIAL: Partial support. Details should be provided in note.
-// - FULL: Full support
-// - returns: Indicates a known return value. Values are syscall errors. This
-// is treated as a string so you can use something like
-// "returns:EPERM or ENOSYS".
-// - issue: A Github issue number.
-// - note: A note
-//
-// Example:
-// // @Syscall(mmap, arg:MAP_PRIVATE, support:FULL, note:Private memory fully supported)
-// // @Syscall(mmap, arg:MAP_SHARED, issue:123, note:Shared memory not supported)
-// // @Syscall(setxattr, returns:ENOTSUP, note:Requires file system support)
-//
-// Annotations should be placed as close to their implementation as possible
-// (preferrably as part of a supporting function's Godoc) and should be
-// updated as syscall support changes. Unimplemented syscalls are documented
-// here due to their lack of a supporting function or method.
var AMD64 = &kernel.SyscallTable{
OS: abi.Linux,
Arch: arch.AMD64,
@@ -74,405 +47,338 @@ var AMD64 = &kernel.SyscallTable{
Version: "#1 SMP Sun Jan 10 15:06:54 PST 2016",
},
AuditNumber: _AUDIT_ARCH_X86_64,
- Table: map[uintptr]kernel.SyscallFn{
- 0: Read,
- 1: Write,
- 2: Open,
- 3: Close,
- 4: Stat,
- 5: Fstat,
- 6: Lstat,
- 7: Poll,
- 8: Lseek,
- 9: Mmap,
- 10: Mprotect,
- 11: Munmap,
- 12: Brk,
- 13: RtSigaction,
- 14: RtSigprocmask,
- 15: RtSigreturn,
- 16: Ioctl,
- 17: Pread64,
- 18: Pwrite64,
- 19: Readv,
- 20: Writev,
- 21: Access,
- 22: Pipe,
- 23: Select,
- 24: SchedYield,
- 25: Mremap,
- 26: Msync,
- 27: Mincore,
- 28: Madvise,
- 29: Shmget,
- 30: Shmat,
- 31: Shmctl,
- 32: Dup,
- 33: Dup2,
- 34: Pause,
- 35: Nanosleep,
- 36: Getitimer,
- 37: Alarm,
- 38: Setitimer,
- 39: Getpid,
- 40: Sendfile,
- 41: Socket,
- 42: Connect,
- 43: Accept,
- 44: SendTo,
- 45: RecvFrom,
- 46: SendMsg,
- 47: RecvMsg,
- 48: Shutdown,
- 49: Bind,
- 50: Listen,
- 51: GetSockName,
- 52: GetPeerName,
- 53: SocketPair,
- 54: SetSockOpt,
- 55: GetSockOpt,
- 56: Clone,
- 57: Fork,
- 58: Vfork,
- 59: Execve,
- 60: Exit,
- 61: Wait4,
- 62: Kill,
- 63: Uname,
- 64: Semget,
- 65: Semop,
- 66: Semctl,
- 67: Shmdt,
- // 68: @Syscall(Msgget), TODO(b/29354921)
- // 69: @Syscall(Msgsnd), TODO(b/29354921)
- // 70: @Syscall(Msgrcv), TODO(b/29354921)
- // 71: @Syscall(Msgctl), TODO(b/29354921)
- 72: Fcntl,
- 73: Flock,
- 74: Fsync,
- 75: Fdatasync,
- 76: Truncate,
- 77: Ftruncate,
- 78: Getdents,
- 79: Getcwd,
- 80: Chdir,
- 81: Fchdir,
- 82: Rename,
- 83: Mkdir,
- 84: Rmdir,
- 85: Creat,
- 86: Link,
- 87: Unlink,
- 88: Symlink,
- 89: Readlink,
- 90: Chmod,
- 91: Fchmod,
- 92: Chown,
- 93: Fchown,
- 94: Lchown,
- 95: Umask,
- 96: Gettimeofday,
- 97: Getrlimit,
- 98: Getrusage,
- 99: Sysinfo,
- 100: Times,
- 101: Ptrace,
- 102: Getuid,
- 103: Syslog,
- 104: Getgid,
- 105: Setuid,
- 106: Setgid,
- 107: Geteuid,
- 108: Getegid,
- 109: Setpgid,
- 110: Getppid,
- 111: Getpgrp,
- 112: Setsid,
- 113: Setreuid,
- 114: Setregid,
- 115: Getgroups,
- 116: Setgroups,
- 117: Setresuid,
- 118: Getresuid,
- 119: Setresgid,
- 120: Getresgid,
- 121: Getpgid,
- // 122: @Syscall(Setfsuid), TODO(b/112851702)
- // 123: @Syscall(Setfsgid), TODO(b/112851702)
- 124: Getsid,
- 125: Capget,
- 126: Capset,
- 127: RtSigpending,
- 128: RtSigtimedwait,
- 129: RtSigqueueinfo,
- 130: RtSigsuspend,
- 131: Sigaltstack,
- 132: Utime,
- 133: Mknod,
- // @Syscall(Uselib, note:Obsolete)
- 134: syscalls.Error(syscall.ENOSYS),
- // @Syscall(SetPersonality, returns:EINVAL, note:Unable to change personality)
- 135: syscalls.ErrorWithEvent(syscall.EINVAL),
- // @Syscall(Ustat, note:Needs filesystem support)
- 136: syscalls.ErrorWithEvent(syscall.ENOSYS),
- 137: Statfs,
- 138: Fstatfs,
- // 139: @Syscall(Sysfs), TODO(gvisor.dev/issue/165)
- 140: Getpriority,
- 141: Setpriority,
- // @Syscall(SchedSetparam, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise)
- 142: syscalls.CapError(linux.CAP_SYS_NICE), // requires cap_sys_nice
- 143: SchedGetparam,
- 144: SchedSetscheduler,
- 145: SchedGetscheduler,
- 146: SchedGetPriorityMax,
- 147: SchedGetPriorityMin,
- // @Syscall(SchedRrGetInterval, returns:EPERM)
- 148: syscalls.ErrorWithEvent(syscall.EPERM),
- 149: Mlock,
- 150: Munlock,
- 151: Mlockall,
- 152: Munlockall,
- // @Syscall(Vhangup, returns:EPERM)
- 153: syscalls.CapError(linux.CAP_SYS_TTY_CONFIG),
- // @Syscall(ModifyLdt, returns:EPERM)
- 154: syscalls.Error(syscall.EPERM),
- // @Syscall(PivotRoot, returns:EPERM)
- 155: syscalls.Error(syscall.EPERM),
- // @Syscall(Sysctl, returns:EPERM)
- 156: syscalls.Error(syscall.EPERM), // syscall is "worthless"
- 157: Prctl,
- 158: ArchPrctl,
- // @Syscall(Adjtimex, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_time; ENOSYS otherwise)
- 159: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time
- 160: Setrlimit,
- 161: Chroot,
- 162: Sync,
- // @Syscall(Acct, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_pacct; ENOSYS otherwise)
- 163: syscalls.CapError(linux.CAP_SYS_PACCT), // requires cap_sys_pacct
- // @Syscall(Settimeofday, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_time; ENOSYS otherwise)
- 164: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time
- 165: Mount,
- 166: Umount2,
- // @Syscall(Swapon, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise)
- 167: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin
- // @Syscall(Swapoff, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise)
- 168: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin
- // @Syscall(Reboot, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise)
- 169: syscalls.CapError(linux.CAP_SYS_BOOT), // requires cap_sys_boot
- 170: Sethostname,
- 171: Setdomainname,
- // @Syscall(Iopl, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_rawio; ENOSYS otherwise)
- 172: syscalls.CapError(linux.CAP_SYS_RAWIO), // requires cap_sys_rawio
- // @Syscall(Ioperm, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_rawio; ENOSYS otherwise)
- 173: syscalls.CapError(linux.CAP_SYS_RAWIO), // requires cap_sys_rawio
- // @Syscall(CreateModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise)
- 174: syscalls.CapError(linux.CAP_SYS_MODULE), // CreateModule, requires cap_sys_module
- // @Syscall(InitModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise)
- 175: syscalls.CapError(linux.CAP_SYS_MODULE), // requires cap_sys_module
- // @Syscall(DeleteModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise)
- 176: syscalls.CapError(linux.CAP_SYS_MODULE), // requires cap_sys_module
- // @Syscall(GetKernelSyms, note:Not supported in > 2.6)
- 177: syscalls.Error(syscall.ENOSYS),
- // @Syscall(QueryModule, note:Not supported in > 2.6)
- 178: syscalls.Error(syscall.ENOSYS),
- // @Syscall(Quotactl, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise)
- 179: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin (most operations)
- // @Syscall(Nfsservctl, note:Does not exist > 3.1)
- 180: syscalls.Error(syscall.ENOSYS),
- // @Syscall(Getpmsg, note:Not implemented in Linux)
- 181: syscalls.Error(syscall.ENOSYS),
- // @Syscall(Putpmsg, note:Not implemented in Linux)
- 182: syscalls.Error(syscall.ENOSYS),
- // @Syscall(AfsSyscall, note:Not implemented in Linux)
- 183: syscalls.Error(syscall.ENOSYS),
- // @Syscall(Tuxcall, note:Not implemented in Linux)
- 184: syscalls.Error(syscall.ENOSYS),
- // @Syscall(Security, note:Not implemented in Linux)
- 185: syscalls.Error(syscall.ENOSYS),
- 186: Gettid,
- 187: nil, // @Syscall(Readahead), TODO(b/29351341)
- // @Syscall(Setxattr, returns:ENOTSUP, note:Requires filesystem support)
- 188: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Lsetxattr, returns:ENOTSUP, note:Requires filesystem support)
- 189: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Fsetxattr, returns:ENOTSUP, note:Requires filesystem support)
- 190: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Getxattr, returns:ENOTSUP, note:Requires filesystem support)
- 191: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Lgetxattr, returns:ENOTSUP, note:Requires filesystem support)
- 192: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Fgetxattr, returns:ENOTSUP, note:Requires filesystem support)
- 193: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Listxattr, returns:ENOTSUP, note:Requires filesystem support)
- 194: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Llistxattr, returns:ENOTSUP, note:Requires filesystem support)
- 195: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Flistxattr, returns:ENOTSUP, note:Requires filesystem support)
- 196: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Removexattr, returns:ENOTSUP, note:Requires filesystem support)
- 197: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Lremovexattr, returns:ENOTSUP, note:Requires filesystem support)
- 198: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- // @Syscall(Fremovexattr, returns:ENOTSUP, note:Requires filesystem support)
- 199: syscalls.ErrorWithEvent(syscall.ENOTSUP),
- 200: Tkill,
- 201: Time,
- 202: Futex,
- 203: SchedSetaffinity,
- 204: SchedGetaffinity,
- // @Syscall(SetThreadArea, note:Expected to return ENOSYS on 64-bit)
- 205: syscalls.Error(syscall.ENOSYS),
- 206: IoSetup,
- 207: IoDestroy,
- 208: IoGetevents,
- 209: IoSubmit,
- 210: IoCancel,
- // @Syscall(GetThreadArea, note:Expected to return ENOSYS on 64-bit)
- 211: syscalls.Error(syscall.ENOSYS),
- // @Syscall(LookupDcookie, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise)
- 212: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin
- 213: EpollCreate,
- // @Syscall(EpollCtlOld, note:Deprecated)
- 214: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated (afaik, unused)
- // @Syscall(EpollWaitOld, note:Deprecated)
- 215: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated (afaik, unused)
- // @Syscall(RemapFilePages, note:Deprecated)
- 216: syscalls.ErrorWithEvent(syscall.ENOSYS), // deprecated since 3.16
- 217: Getdents64,
- 218: SetTidAddress,
- 219: RestartSyscall,
- // 220: @Syscall(Semtimedop), TODO(b/29354920)
- 221: Fadvise64,
- 222: TimerCreate,
- 223: TimerSettime,
- 224: TimerGettime,
- 225: TimerGetoverrun,
- 226: TimerDelete,
- 227: ClockSettime,
- 228: ClockGettime,
- 229: ClockGetres,
- 230: ClockNanosleep,
- 231: ExitGroup,
- 232: EpollWait,
- 233: EpollCtl,
- 234: Tgkill,
- 235: Utimes,
- // @Syscall(Vserver, note:Not implemented by Linux)
- 236: syscalls.Error(syscall.ENOSYS), // Vserver, not implemented by Linux
- // @Syscall(Mbind, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise), TODO(b/117792295)
- 237: syscalls.CapError(linux.CAP_SYS_NICE), // may require cap_sys_nice
- 238: SetMempolicy,
- 239: GetMempolicy,
- // 240: @Syscall(MqOpen), TODO(b/29354921)
- // 241: @Syscall(MqUnlink), TODO(b/29354921)
- // 242: @Syscall(MqTimedsend), TODO(b/29354921)
- // 243: @Syscall(MqTimedreceive), TODO(b/29354921)
- // 244: @Syscall(MqNotify), TODO(b/29354921)
- // 245: @Syscall(MqGetsetattr), TODO(b/29354921)
- 246: syscalls.CapError(linux.CAP_SYS_BOOT), // kexec_load, requires cap_sys_boot
- 247: Waitid,
- // @Syscall(AddKey, returns:EACCES, note:Not available to user)
- 248: syscalls.Error(syscall.EACCES),
- // @Syscall(RequestKey, returns:EACCES, note:Not available to user)
- 249: syscalls.Error(syscall.EACCES),
- // @Syscall(Keyctl, returns:EACCES, note:Not available to user)
- 250: syscalls.Error(syscall.EACCES),
- // @Syscall(IoprioSet, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise)
- 251: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_nice or cap_sys_admin (depending)
- // @Syscall(IoprioGet, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_admin; ENOSYS otherwise)
- 252: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_nice or cap_sys_admin (depending)
- 253: InotifyInit,
- 254: InotifyAddWatch,
- 255: InotifyRmWatch,
- // @Syscall(MigratePages, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise)
- 256: syscalls.CapError(linux.CAP_SYS_NICE),
- 257: Openat,
- 258: Mkdirat,
- 259: Mknodat,
- 260: Fchownat,
- 261: Futimesat,
- 262: Fstatat,
- 263: Unlinkat,
- 264: Renameat,
- 265: Linkat,
- 266: Symlinkat,
- 267: Readlinkat,
- 268: Fchmodat,
- 269: Faccessat,
- 270: Pselect,
- 271: Ppoll,
- 272: Unshare,
- // @Syscall(SetRobustList, note:Obsolete)
- 273: syscalls.Error(syscall.ENOSYS),
- // @Syscall(GetRobustList, note:Obsolete)
- 274: syscalls.Error(syscall.ENOSYS),
- 275: Splice,
- // 276: @Syscall(Tee), TODO(b/29354098)
- 277: SyncFileRange,
- // 278: @Syscall(Vmsplice), TODO(b/29354098)
- // @Syscall(MovePages, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_nice; ENOSYS otherwise)
- 279: syscalls.CapError(linux.CAP_SYS_NICE), // requires cap_sys_nice (mostly)
- 280: Utimensat,
- 281: EpollPwait,
- // 282: @Syscall(Signalfd), TODO(b/19846426)
- 283: TimerfdCreate,
- 284: Eventfd,
- 285: Fallocate,
- 286: TimerfdSettime,
- 287: TimerfdGettime,
- 288: Accept4,
- // 289: @Syscall(Signalfd4), TODO(b/19846426)
- 290: Eventfd2,
- 291: EpollCreate1,
- 292: Dup3,
- 293: Pipe2,
- 294: InotifyInit1,
- 295: Preadv,
- 296: Pwritev,
- 297: RtTgsigqueueinfo,
- // @Syscall(PerfEventOpen, returns:ENODEV, note:No support for perf counters)
- 298: syscalls.ErrorWithEvent(syscall.ENODEV),
- 299: RecvMMsg,
- // @Syscall(FanotifyInit, note:Needs CONFIG_FANOTIFY)
- 300: syscalls.ErrorWithEvent(syscall.ENOSYS),
- // @Syscall(FanotifyMark, note:Needs CONFIG_FANOTIFY)
- 301: syscalls.ErrorWithEvent(syscall.ENOSYS),
- 302: Prlimit64,
- // @Syscall(NameToHandleAt, returns:EOPNOTSUPP, note:Needs filesystem support)
- 303: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP),
- // @Syscall(OpenByHandleAt, returns:EOPNOTSUPP, note:Needs filesystem support)
- 304: syscalls.ErrorWithEvent(syscall.EOPNOTSUPP),
- // @Syscall(ClockAdjtime, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise)
- 305: syscalls.CapError(linux.CAP_SYS_TIME), // requires cap_sys_time
- 306: Syncfs,
- 307: SendMMsg,
- // 308: @Syscall(Setns), TODO(b/29354995)
- 309: Getcpu,
- // 310: @Syscall(ProcessVmReadv), TODO(gvisor.dev/issue/158) may require cap_sys_ptrace
- // 311: @Syscall(ProcessVmWritev), TODO(gvisor.dev/issue/158) may require cap_sys_ptrace
- // @Syscall(Kcmp, returns:EPERM or ENOSYS, note:Requires cap_sys_ptrace)
- 312: syscalls.CapError(linux.CAP_SYS_PTRACE),
- // @Syscall(FinitModule, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_module; ENOSYS otherwise)
- 313: syscalls.CapError(linux.CAP_SYS_MODULE),
- // 314: @Syscall(SchedSetattr), TODO(b/118902272), we have no scheduler
- // 315: @Syscall(SchedGetattr), TODO(b/118902272), we have no scheduler
- // 316: @Syscall(Renameat2), TODO(b/118902772)
- 317: Seccomp,
- 318: GetRandom,
- 319: MemfdCreate,
- // @Syscall(KexecFileLoad, EPERM or ENOSYS, note:Infeasible to support. Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise)
- 320: syscalls.CapError(linux.CAP_SYS_BOOT),
- // @Syscall(Bpf, returns:EPERM or ENOSYS, note:Returns EPERM if the process does not have cap_sys_boot; ENOSYS otherwise)
- 321: syscalls.CapError(linux.CAP_SYS_ADMIN), // requires cap_sys_admin for all commands
- // 322: @Syscall(Execveat), TODO(b/118901836)
- // 323: @Syscall(Userfaultfd), TODO(b/118906345)
- // 324: @Syscall(Membarrier), TODO(b/118904897)
- 325: Mlock2,
+ Table: map[uintptr]kernel.Syscall{
+ 0: syscalls.Supported("read", Read),
+ 1: syscalls.Supported("write", Write),
+ 2: syscalls.Supported("open", Open),
+ 3: syscalls.Supported("close", Close),
+ 4: syscalls.Undocumented("stat", Stat),
+ 5: syscalls.Undocumented("fstat", Fstat),
+ 6: syscalls.Undocumented("lstat", Lstat),
+ 7: syscalls.Undocumented("poll", Poll),
+ 8: syscalls.Undocumented("lseek", Lseek),
+ 9: syscalls.Undocumented("mmap", Mmap),
+ 10: syscalls.Undocumented("mprotect", Mprotect),
+ 11: syscalls.Undocumented("munmap", Munmap),
+ 12: syscalls.Undocumented("brk", Brk),
+ 13: syscalls.Undocumented("rt_sigaction", RtSigaction),
+ 14: syscalls.Undocumented("rt_sigprocmask", RtSigprocmask),
+ 15: syscalls.Undocumented("rt_sigreturn", RtSigreturn),
+ 16: syscalls.Undocumented("ioctl", Ioctl),
+ 17: syscalls.Undocumented("pread64", Pread64),
+ 18: syscalls.Undocumented("pwrite64", Pwrite64),
+ 19: syscalls.Undocumented("readv", Readv),
+ 20: syscalls.Undocumented("writev", Writev),
+ 21: syscalls.Undocumented("access", Access),
+ 22: syscalls.Undocumented("pipe", Pipe),
+ 23: syscalls.Undocumented("select", Select),
+ 24: syscalls.Undocumented("sched_yield", SchedYield),
+ 25: syscalls.Undocumented("mremap", Mremap),
+ 26: syscalls.Undocumented("msync", Msync),
+ 27: syscalls.Undocumented("mincore", Mincore),
+ 28: syscalls.Undocumented("madvise", Madvise),
+ 29: syscalls.Undocumented("shmget", Shmget),
+ 30: syscalls.Undocumented("shmat", Shmat),
+ 31: syscalls.Undocumented("shmctl", Shmctl),
+ 32: syscalls.Undocumented("dup", Dup),
+ 33: syscalls.Undocumented("dup2", Dup2),
+ 34: syscalls.Undocumented("pause", Pause),
+ 35: syscalls.Undocumented("nanosleep", Nanosleep),
+ 36: syscalls.Undocumented("getitimer", Getitimer),
+ 37: syscalls.Undocumented("alarm", Alarm),
+ 38: syscalls.Undocumented("setitimer", Setitimer),
+ 39: syscalls.Undocumented("getpid", Getpid),
+ 40: syscalls.Undocumented("sendfile", Sendfile),
+ 41: syscalls.Undocumented("socket", Socket),
+ 42: syscalls.Undocumented("connect", Connect),
+ 43: syscalls.Undocumented("accept", Accept),
+ 44: syscalls.Undocumented("sendto", SendTo),
+ 45: syscalls.Undocumented("recvfrom", RecvFrom),
+ 46: syscalls.Undocumented("sendmsg", SendMsg),
+ 47: syscalls.Undocumented("recvmsg", RecvMsg),
+ 48: syscalls.Undocumented("shutdown", Shutdown),
+ 49: syscalls.Undocumented("bind", Bind),
+ 50: syscalls.Undocumented("listen", Listen),
+ 51: syscalls.Undocumented("getsockname", GetSockName),
+ 52: syscalls.Undocumented("getpeername", GetPeerName),
+ 53: syscalls.Undocumented("socketpair", SocketPair),
+ 54: syscalls.Undocumented("setsockopt", SetSockOpt),
+ 55: syscalls.Undocumented("getsockopt", GetSockOpt),
+ 56: syscalls.Undocumented("clone", Clone),
+ 57: syscalls.Undocumented("fork", Fork),
+ 58: syscalls.Undocumented("vfork", Vfork),
+ 59: syscalls.Undocumented("execve", Execve),
+ 60: syscalls.Undocumented("exit", Exit),
+ 61: syscalls.Undocumented("wait4", Wait4),
+ 62: syscalls.Undocumented("kill", Kill),
+ 63: syscalls.Undocumented("uname", Uname),
+ 64: syscalls.Undocumented("semget", Semget),
+ 65: syscalls.Undocumented("semop", Semop),
+ 66: syscalls.Undocumented("semctl", Semctl),
+ 67: syscalls.Undocumented("shmdt", Shmdt),
+ 68: syscalls.ErrorWithEvent("msgget", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 69: syscalls.ErrorWithEvent("msgsnd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 70: syscalls.ErrorWithEvent("msgrcv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 71: syscalls.ErrorWithEvent("msgctl", syscall.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
+ 72: syscalls.Undocumented("fcntl", Fcntl),
+ 73: syscalls.Undocumented("flock", Flock),
+ 74: syscalls.Undocumented("fsync", Fsync),
+ 75: syscalls.Undocumented("fdatasync", Fdatasync),
+ 76: syscalls.Undocumented("truncate", Truncate),
+ 77: syscalls.Undocumented("ftruncate", Ftruncate),
+ 78: syscalls.Undocumented("getdents", Getdents),
+ 79: syscalls.Undocumented("getcwd", Getcwd),
+ 80: syscalls.Undocumented("chdir", Chdir),
+ 81: syscalls.Undocumented("fchdir", Fchdir),
+ 82: syscalls.Undocumented("rename", Rename),
+ 83: syscalls.Undocumented("mkdir", Mkdir),
+ 84: syscalls.Undocumented("rmdir", Rmdir),
+ 85: syscalls.Undocumented("creat", Creat),
+ 86: syscalls.Undocumented("link", Link),
+ 87: syscalls.Undocumented("unlink", Unlink),
+ 88: syscalls.Undocumented("symlink", Symlink),
+ 89: syscalls.Undocumented("readlink", Readlink),
+ 90: syscalls.Undocumented("chmod", Chmod),
+ 91: syscalls.Undocumented("fchmod", Fchmod),
+ 92: syscalls.Undocumented("chown", Chown),
+ 93: syscalls.Undocumented("fchown", Fchown),
+ 94: syscalls.Undocumented("lchown", Lchown),
+ 95: syscalls.Undocumented("umask", Umask),
+ 96: syscalls.Undocumented("gettimeofday", Gettimeofday),
+ 97: syscalls.Undocumented("getrlimit", Getrlimit),
+ 98: syscalls.Undocumented("getrusage", Getrusage),
+ 99: syscalls.Undocumented("sysinfo", Sysinfo),
+ 100: syscalls.Undocumented("times", Times),
+ 101: syscalls.Undocumented("ptrace", Ptrace),
+ 102: syscalls.Undocumented("getuid", Getuid),
+ 103: syscalls.Undocumented("syslog", Syslog),
+ 104: syscalls.Undocumented("getgid", Getgid),
+ 105: syscalls.Undocumented("setuid", Setuid),
+ 106: syscalls.Undocumented("setgid", Setgid),
+ 107: syscalls.Undocumented("geteuid", Geteuid),
+ 108: syscalls.Undocumented("getegid", Getegid),
+ 109: syscalls.Undocumented("setpgid", Setpgid),
+ 110: syscalls.Undocumented("getppid", Getppid),
+ 111: syscalls.Undocumented("getpgrp", Getpgrp),
+ 112: syscalls.Undocumented("setsid", Setsid),
+ 113: syscalls.Undocumented("setreuid", Setreuid),
+ 114: syscalls.Undocumented("setregid", Setregid),
+ 115: syscalls.Undocumented("getgroups", Getgroups),
+ 116: syscalls.Undocumented("setgroups", Setgroups),
+ 117: syscalls.Undocumented("setresuid", Setresuid),
+ 118: syscalls.Undocumented("getresuid", Getresuid),
+ 119: syscalls.Undocumented("setresgid", Setresgid),
+ 120: syscalls.Undocumented("getresgid", Getresgid),
+ 121: syscalls.Undocumented("getpgid", Getpgid),
+ 122: syscalls.ErrorWithEvent("setfsuid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+ 123: syscalls.ErrorWithEvent("setfsgid", syscall.ENOSYS, "", []string{"gvisor.dev/issue/260"}), // TODO(b/112851702)
+ 124: syscalls.Undocumented("getsid", Getsid),
+ 125: syscalls.Undocumented("capget", Capget),
+ 126: syscalls.Undocumented("capset", Capset),
+ 127: syscalls.Undocumented("rt_sigpending", RtSigpending),
+ 128: syscalls.Undocumented("rt_sigtimedwait", RtSigtimedwait),
+ 129: syscalls.Undocumented("rt_sigqueueinfo", RtSigqueueinfo),
+ 130: syscalls.Undocumented("rt_sigsuspend", RtSigsuspend),
+ 131: syscalls.Undocumented("sigaltstack", Sigaltstack),
+ 132: syscalls.Undocumented("utime", Utime),
+ 133: syscalls.Undocumented("mknod", Mknod),
+ 134: syscalls.Error("uselib", syscall.ENOSYS, "Obsolete", nil),
+ 135: syscalls.ErrorWithEvent("personality", syscall.EINVAL, "Unable to change personality.", nil),
+ 136: syscalls.ErrorWithEvent("ustat", syscall.ENOSYS, "Needs filesystem support.", nil),
+ 137: syscalls.Undocumented("statfs", Statfs),
+ 138: syscalls.Undocumented("fstatfs", Fstatfs),
+ 139: syscalls.ErrorWithEvent("sysfs", syscall.ENOSYS, "", []string{"gvisor.dev/issue/165"}),
+ 140: syscalls.Undocumented("getpriority", Getpriority),
+ 141: syscalls.Undocumented("setpriority", Setpriority),
+ 142: syscalls.CapError("sched_setparam", linux.CAP_SYS_NICE, "", nil),
+ 143: syscalls.Undocumented("sched_getparam", SchedGetparam),
+ 144: syscalls.Undocumented("sched_setscheduler", SchedSetscheduler),
+ 145: syscalls.Undocumented("sched_getscheduler", SchedGetscheduler),
+ 146: syscalls.Undocumented("sched_get_priority_max", SchedGetPriorityMax),
+ 147: syscalls.Undocumented("sched_get_priority_min", SchedGetPriorityMin),
+ 148: syscalls.ErrorWithEvent("sched_rr_get_interval", syscall.EPERM, "", nil),
+ 149: syscalls.Undocumented("mlock", Mlock),
+ 150: syscalls.Undocumented("munlock", Munlock),
+ 151: syscalls.Undocumented("mlockall", Mlockall),
+ 152: syscalls.Undocumented("munlockall", Munlockall),
+ 153: syscalls.CapError("vhangup", linux.CAP_SYS_TTY_CONFIG, "", nil),
+ 154: syscalls.Error("modify_ldt", syscall.EPERM, "", nil),
+ 155: syscalls.Error("pivot_root", syscall.EPERM, "", nil),
+ 156: syscalls.Error("sysctl", syscall.EPERM, `syscall is "worthless"`, nil),
+ 157: syscalls.Undocumented("prctl", Prctl),
+ 158: syscalls.Undocumented("arch_prctl", ArchPrctl),
+ 159: syscalls.CapError("adjtimex", linux.CAP_SYS_TIME, "", nil),
+ 160: syscalls.Undocumented("setrlimit", Setrlimit),
+ 161: syscalls.Undocumented("chroot", Chroot),
+ 162: syscalls.Undocumented("sync", Sync),
+ 163: syscalls.CapError("acct", linux.CAP_SYS_PACCT, "", nil),
+ 164: syscalls.CapError("settimeofday", linux.CAP_SYS_TIME, "", nil),
+ 165: syscalls.Undocumented("mount", Mount),
+ 166: syscalls.Undocumented("umount2", Umount2),
+ 167: syscalls.CapError("swapon", linux.CAP_SYS_ADMIN, "", nil),
+ 168: syscalls.CapError("swapoff", linux.CAP_SYS_ADMIN, "", nil),
+ 169: syscalls.CapError("reboot", linux.CAP_SYS_BOOT, "", nil),
+ 170: syscalls.Undocumented("sethostname", Sethostname),
+ 171: syscalls.Undocumented("setdomainname", Setdomainname),
+ 172: syscalls.CapError("iopl", linux.CAP_SYS_RAWIO, "", nil),
+ 173: syscalls.CapError("ioperm", linux.CAP_SYS_RAWIO, "", nil),
+ 174: syscalls.CapError("create_module", linux.CAP_SYS_MODULE, "", nil),
+ 175: syscalls.CapError("init_module", linux.CAP_SYS_MODULE, "", nil),
+ 176: syscalls.CapError("delete_module", linux.CAP_SYS_MODULE, "", nil),
+ 177: syscalls.Error("get_kernel_syms", syscall.ENOSYS, "Not supported in > 2.6", nil),
+ 178: syscalls.Error("query_module", syscall.ENOSYS, "Not supported in > 2.6", nil),
+ 179: syscalls.CapError("quotactl", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_admin for most operations
+ 180: syscalls.Error("nfsservctl", syscall.ENOSYS, "Does not exist > 3.1", nil),
+ 181: syscalls.Error("getpmsg", syscall.ENOSYS, "Not implemented in Linux", nil),
+ 182: syscalls.Error("putpmsg", syscall.ENOSYS, "Not implemented in Linux", nil),
+ 183: syscalls.Error("afs_syscall", syscall.ENOSYS, "Not implemented in Linux", nil),
+ 184: syscalls.Error("tuxcall", syscall.ENOSYS, "Not implemented in Linux", nil),
+ 185: syscalls.Error("security", syscall.ENOSYS, "Not implemented in Linux", nil),
+ 186: syscalls.Undocumented("gettid", Gettid),
+ 187: syscalls.ErrorWithEvent("readahead", syscall.ENOSYS, "", []string{"gvisor.dev/issue/261"}), // TODO(b/29351341)
+ 188: syscalls.ErrorWithEvent("setxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 189: syscalls.ErrorWithEvent("lsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 190: syscalls.ErrorWithEvent("fsetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 191: syscalls.ErrorWithEvent("getxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 192: syscalls.ErrorWithEvent("lgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 193: syscalls.ErrorWithEvent("fgetxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 194: syscalls.ErrorWithEvent("listxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 195: syscalls.ErrorWithEvent("llistxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 196: syscalls.ErrorWithEvent("flistxattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 197: syscalls.ErrorWithEvent("removexattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 198: syscalls.ErrorWithEvent("lremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 199: syscalls.ErrorWithEvent("fremovexattr", syscall.ENOTSUP, "Requires filesystem support", nil),
+ 200: syscalls.Undocumented("tkill", Tkill),
+ 201: syscalls.Undocumented("time", Time),
+ 202: syscalls.Undocumented("futex", Futex),
+ 203: syscalls.Undocumented("sched_setaffinity", SchedSetaffinity),
+ 204: syscalls.Undocumented("sched_getaffinity", SchedGetaffinity),
+ 205: syscalls.Error("set_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
+ 206: syscalls.Undocumented("io_setup", IoSetup),
+ 207: syscalls.Undocumented("io_destroy", IoDestroy),
+ 208: syscalls.Undocumented("io_getevents", IoGetevents),
+ 209: syscalls.Undocumented("io_submit", IoSubmit),
+ 210: syscalls.Undocumented("io_cancel", IoCancel),
+ 211: syscalls.Error("get_thread_area", syscall.ENOSYS, "Expected to return ENOSYS on 64-bit", nil),
+ 212: syscalls.CapError("lookup_dcookie", linux.CAP_SYS_ADMIN, "", nil),
+ 213: syscalls.Undocumented("epoll_create", EpollCreate),
+ 214: syscalls.ErrorWithEvent("epoll_ctl_old", syscall.ENOSYS, "Deprecated", nil),
+ 215: syscalls.ErrorWithEvent("epoll_wait_old", syscall.ENOSYS, "Deprecated", nil),
+ 216: syscalls.ErrorWithEvent("remap_file_pages", syscall.ENOSYS, "Deprecated since 3.16", nil),
+ 217: syscalls.Undocumented("getdents64", Getdents64),
+ 218: syscalls.Undocumented("set_tid_address", SetTidAddress),
+ 219: syscalls.Undocumented("restart_syscall", RestartSyscall),
+ 220: syscalls.ErrorWithEvent("semtimedop", syscall.ENOSYS, "", []string{"gvisor.dev/issue/137"}), // TODO(b/29354920)
+ 221: syscalls.Undocumented("fadvise64", Fadvise64),
+ 222: syscalls.Undocumented("timer_create", TimerCreate),
+ 223: syscalls.Undocumented("timer_settime", TimerSettime),
+ 224: syscalls.Undocumented("timer_gettime", TimerGettime),
+ 225: syscalls.Undocumented("timer_getoverrun", TimerGetoverrun),
+ 226: syscalls.Undocumented("timer_delete", TimerDelete),
+ 227: syscalls.Undocumented("clock_settime", ClockSettime),
+ 228: syscalls.Undocumented("clock_gettime", ClockGettime),
+ 229: syscalls.Undocumented("clock_getres", ClockGetres),
+ 230: syscalls.Undocumented("clock_nanosleep", ClockNanosleep),
+ 231: syscalls.Undocumented("exit_group", ExitGroup),
+ 232: syscalls.Undocumented("epoll_wait", EpollWait),
+ 233: syscalls.Undocumented("epoll_ctl", EpollCtl),
+ 234: syscalls.Undocumented("tgkill", Tgkill),
+ 235: syscalls.Undocumented("utimes", Utimes),
+ 236: syscalls.Error("vserver", syscall.ENOSYS, "Not implemented by Linux", nil),
+ 237: syscalls.PartiallySupported("mbind", Mbind, "Stub implementation. Only a single NUMA node is advertised, and mempolicy is ignored accordingly, but mbind() will succeed and has effects reflected by get_mempolicy.", []string{"gvisor.dev/issue/262"}),
+ 238: syscalls.Undocumented("set_mempolicy", SetMempolicy),
+ 239: syscalls.Undocumented("get_mempolicy", GetMempolicy),
+ 240: syscalls.ErrorWithEvent("mq_open", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 241: syscalls.ErrorWithEvent("mq_unlink", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 242: syscalls.ErrorWithEvent("mq_timedsend", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 243: syscalls.ErrorWithEvent("mq_timedreceive", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 244: syscalls.ErrorWithEvent("mq_notify", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 245: syscalls.ErrorWithEvent("mq_getsetattr", syscall.ENOSYS, "", []string{"gvisor.dev/issue/136"}), // TODO(b/29354921)
+ 246: syscalls.CapError("kexec_load", linux.CAP_SYS_BOOT, "", nil),
+ 247: syscalls.Undocumented("waitid", Waitid),
+ 248: syscalls.Error("add_key", syscall.EACCES, "Not available to user", nil),
+ 249: syscalls.Error("request_key", syscall.EACCES, "Not available to user", nil),
+ 250: syscalls.Error("keyctl", syscall.EACCES, "Not available to user", nil),
+ 251: syscalls.CapError("ioprio_set", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+ 252: syscalls.CapError("ioprio_get", linux.CAP_SYS_ADMIN, "", nil), // requires cap_sys_nice or cap_sys_admin (depending)
+ 253: syscalls.Undocumented("inotify_init", InotifyInit),
+ 254: syscalls.Undocumented("inotify_add_watch", InotifyAddWatch),
+ 255: syscalls.Undocumented("inotify_rm_watch", InotifyRmWatch),
+ 256: syscalls.CapError("migrate_pages", linux.CAP_SYS_NICE, "", nil),
+ 257: syscalls.Undocumented("openat", Openat),
+ 258: syscalls.Undocumented("mkdirat", Mkdirat),
+ 259: syscalls.Undocumented("mknodat", Mknodat),
+ 260: syscalls.Undocumented("fchownat", Fchownat),
+ 261: syscalls.Undocumented("futimesat", Futimesat),
+ 262: syscalls.Undocumented("fstatat", Fstatat),
+ 263: syscalls.Undocumented("unlinkat", Unlinkat),
+ 264: syscalls.Undocumented("renameat", Renameat),
+ 265: syscalls.Undocumented("linkat", Linkat),
+ 266: syscalls.Undocumented("symlinkat", Symlinkat),
+ 267: syscalls.Undocumented("readlinkat", Readlinkat),
+ 268: syscalls.Undocumented("fchmodat", Fchmodat),
+ 269: syscalls.Undocumented("faccessat", Faccessat),
+ 270: syscalls.Undocumented("pselect", Pselect),
+ 271: syscalls.Undocumented("ppoll", Ppoll),
+ 272: syscalls.Undocumented("unshare", Unshare),
+ 273: syscalls.Error("set_robust_list", syscall.ENOSYS, "Obsolete", nil),
+ 274: syscalls.Error("get_robust_list", syscall.ENOSYS, "Obsolete", nil),
+ 275: syscalls.PartiallySupported("splice", Splice, "Stub implementation", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 276: syscalls.ErrorWithEvent("tee", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 277: syscalls.Undocumented("sync_file_range", SyncFileRange),
+ 278: syscalls.ErrorWithEvent("vmsplice", syscall.ENOSYS, "", []string{"gvisor.dev/issue/138"}), // TODO(b/29354098)
+ 279: syscalls.CapError("move_pages", linux.CAP_SYS_NICE, "", nil), // requires cap_sys_nice (mostly)
+ 280: syscalls.Undocumented("utimensat", Utimensat),
+ 281: syscalls.Undocumented("epoll_pwait", EpollPwait),
+ 282: syscalls.ErrorWithEvent("signalfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426)
+ 283: syscalls.Undocumented("timerfd_create", TimerfdCreate),
+ 284: syscalls.Undocumented("eventfd", Eventfd),
+ 285: syscalls.Undocumented("fallocate", Fallocate),
+ 286: syscalls.Undocumented("timerfd_settime", TimerfdSettime),
+ 287: syscalls.Undocumented("timerfd_gettime", TimerfdGettime),
+ 288: syscalls.Undocumented("accept4", Accept4),
+ 289: syscalls.ErrorWithEvent("signalfd4", syscall.ENOSYS, "", []string{"gvisor.dev/issue/139"}), // TODO(b/19846426)
+ 290: syscalls.Undocumented("eventfd2", Eventfd2),
+ 291: syscalls.Undocumented("epoll_create1", EpollCreate1),
+ 292: syscalls.Undocumented("dup3", Dup3),
+ 293: syscalls.Undocumented("pipe2", Pipe2),
+ 294: syscalls.Undocumented("inotify_init1", InotifyInit1),
+ 295: syscalls.Undocumented("preadv", Preadv),
+ 296: syscalls.Undocumented("pwritev", Pwritev),
+ 297: syscalls.Undocumented("rt_tgsigqueueinfo", RtTgsigqueueinfo),
+ 298: syscalls.ErrorWithEvent("perf_event_open", syscall.ENODEV, "No support for perf counters", nil),
+ 299: syscalls.Undocumented("recvmmsg", RecvMMsg),
+ 300: syscalls.ErrorWithEvent("fanotify_init", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+ 301: syscalls.ErrorWithEvent("fanotify_mark", syscall.ENOSYS, "Needs CONFIG_FANOTIFY", nil),
+ 302: syscalls.Undocumented("prlimit64", Prlimit64),
+ 303: syscalls.ErrorWithEvent("name_to_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil),
+ 304: syscalls.ErrorWithEvent("open_by_handle_at", syscall.EOPNOTSUPP, "Needs filesystem support", nil),
+ 305: syscalls.CapError("clock_adjtime", linux.CAP_SYS_TIME, "", nil),
+ 306: syscalls.Undocumented("syncfs", Syncfs),
+ 307: syscalls.Undocumented("sendmmsg", SendMMsg),
+ 308: syscalls.ErrorWithEvent("setns", syscall.EOPNOTSUPP, "Needs filesystem support", []string{"gvisor.dev/issue/140"}), // TODO(b/29354995)
+ 309: syscalls.Undocumented("getcpu", Getcpu),
+ 310: syscalls.ErrorWithEvent("process_vm_readv", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+ 311: syscalls.ErrorWithEvent("process_vm_writev", syscall.ENOSYS, "", []string{"gvisor.dev/issue/158"}),
+ 312: syscalls.CapError("kcmp", linux.CAP_SYS_PTRACE, "", nil),
+ 313: syscalls.CapError("finit_module", linux.CAP_SYS_MODULE, "", nil),
+ 314: syscalls.ErrorWithEvent("sched_setattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+ 315: syscalls.ErrorWithEvent("sched_getattr", syscall.ENOSYS, "gVisor does not implement a scheduler.", []string{"gvisor.dev/issue/264"}), // TODO(b/118902272)
+ 316: syscalls.ErrorWithEvent("renameat2", syscall.ENOSYS, "", []string{"gvisor.dev/issue/263"}), // TODO(b/118902772)
+ 317: syscalls.Undocumented("seccomp", Seccomp),
+ 318: syscalls.Undocumented("getrandom", GetRandom),
+ 319: syscalls.Undocumented("memfd_create", MemfdCreate),
+ 320: syscalls.CapError("kexec_file_load", linux.CAP_SYS_BOOT, "", nil),
+ 321: syscalls.CapError("bpf", linux.CAP_SYS_ADMIN, "", nil),
+ 322: syscalls.ErrorWithEvent("execveat", syscall.ENOSYS, "", []string{"gvisor.dev/issue/265"}), // TODO(b/118901836)
+ 323: syscalls.ErrorWithEvent("userfaultfd", syscall.ENOSYS, "", []string{"gvisor.dev/issue/266"}), // TODO(b/118906345)
+ 324: syscalls.ErrorWithEvent("membarrier", syscall.ENOSYS, "", []string{"gvisor.dev/issue/267"}), // TODO(b/118904897)
+ 325: syscalls.Undocumented("mlock2", Mlock2),
+
// Syscalls after 325 are "backports" from versions of Linux after 4.4.
- // 326: @Syscall(CopyFileRange),
- 327: Preadv2,
- 328: Pwritev2,
+ 326: syscalls.ErrorWithEvent("copy_file_range", syscall.ENOSYS, "", nil),
+ 327: syscalls.Undocumented("preadv2", Preadv2),
+ 328: syscalls.Undocumented("pwritev2", Pwritev2),
},
Emulate: map[usermem.Addr]uintptr{
diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go
new file mode 100644
index 000000000..652b2c206
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go
@@ -0,0 +1,312 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package linux
+
+import (
+ "fmt"
+
+ "gvisor.googlesource.com/gvisor/pkg/abi/linux"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// We unconditionally report a single NUMA node. This also means that our
+// "nodemask_t" is a single unsigned long (uint64).
+const (
+ maxNodes = 1
+ allowedNodemask = (1 << maxNodes) - 1
+)
+
+func copyInNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32) (uint64, error) {
+ // "nodemask points to a bit mask of node IDs that contains up to maxnode
+ // bits. The bit mask size is rounded to the next multiple of
+ // sizeof(unsigned long), but the kernel will use bits only up to maxnode.
+ // A NULL value of nodemask or a maxnode value of zero specifies the empty
+ // set of nodes. If the value of maxnode is zero, the nodemask argument is
+ // ignored." - set_mempolicy(2). Unfortunately, most of this is inaccurate
+ // because of what appears to be a bug: mm/mempolicy.c:get_nodes() uses
+ // maxnode-1, not maxnode, as the number of bits.
+ bits := maxnode - 1
+ if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0
+ return 0, syserror.EINVAL
+ }
+ if bits == 0 {
+ return 0, nil
+ }
+ // Copy in the whole nodemask.
+ numUint64 := (bits + 63) / 64
+ buf := t.CopyScratchBuffer(int(numUint64) * 8)
+ if _, err := t.CopyInBytes(addr, buf); err != nil {
+ return 0, err
+ }
+ val := usermem.ByteOrder.Uint64(buf)
+ // Check that only allowed bits in the first unsigned long in the nodemask
+ // are set.
+ if val&^allowedNodemask != 0 {
+ return 0, syserror.EINVAL
+ }
+ // Check that all remaining bits in the nodemask are 0.
+ for i := 8; i < len(buf); i++ {
+ if buf[i] != 0 {
+ return 0, syserror.EINVAL
+ }
+ }
+ return val, nil
+}
+
+func copyOutNodemask(t *kernel.Task, addr usermem.Addr, maxnode uint32, val uint64) error {
+ // mm/mempolicy.c:copy_nodes_to_user() also uses maxnode-1 as the number of
+ // bits.
+ bits := maxnode - 1
+ if bits > usermem.PageSize*8 { // also handles overflow from maxnode == 0
+ return syserror.EINVAL
+ }
+ if bits == 0 {
+ return nil
+ }
+ // Copy out the first unsigned long in the nodemask.
+ buf := t.CopyScratchBuffer(8)
+ usermem.ByteOrder.PutUint64(buf, val)
+ if _, err := t.CopyOutBytes(addr, buf); err != nil {
+ return err
+ }
+ // Zero out remaining unsigned longs in the nodemask.
+ if bits > 64 {
+ remAddr, ok := addr.AddLength(8)
+ if !ok {
+ return syserror.EFAULT
+ }
+ remUint64 := (bits - 1) / 64
+ if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{
+ AddressSpaceActive: true,
+ }); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// GetMempolicy implements the syscall get_mempolicy(2).
+func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ mode := args[0].Pointer()
+ nodemask := args[1].Pointer()
+ maxnode := args[2].Uint()
+ addr := args[3].Pointer()
+ flags := args[4].Uint()
+
+ if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ nodeFlag := flags&linux.MPOL_F_NODE != 0
+ addrFlag := flags&linux.MPOL_F_ADDR != 0
+ memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0
+
+ // "EINVAL: The value specified by maxnode is less than the number of node
+ // IDs supported by the system." - get_mempolicy(2)
+ if nodemask != 0 && maxnode < maxNodes {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is
+ // ignored and the set of nodes (memories) that the thread is allowed to
+ // specify in subsequent calls to mbind(2) or set_mempolicy(2) (in the
+ // absence of any mode flags) is returned in nodemask."
+ if memsAllowed {
+ // "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either
+ // MPOL_F_ADDR or MPOL_F_NODE."
+ if nodeFlag || addrFlag {
+ return 0, nil, syserror.EINVAL
+ }
+ if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil {
+ return 0, nil, err
+ }
+ return 0, nil, nil
+ }
+
+ // "If flags specifies MPOL_F_ADDR, then information is returned about the
+ // policy governing the memory address given in addr. ... If the mode
+ // argument is not NULL, then get_mempolicy() will store the policy mode
+ // and any optional mode flags of the requested NUMA policy in the location
+ // pointed to by this argument. If nodemask is not NULL, then the nodemask
+ // associated with the policy will be stored in the location pointed to by
+ // this argument."
+ if addrFlag {
+ policy, nodemaskVal, err := t.MemoryManager().NumaPolicy(addr)
+ if err != nil {
+ return 0, nil, err
+ }
+ if nodeFlag {
+ // "If flags specifies both MPOL_F_NODE and MPOL_F_ADDR,
+ // get_mempolicy() will return the node ID of the node on which the
+ // address addr is allocated into the location pointed to by mode.
+ // If no page has yet been allocated for the specified address,
+ // get_mempolicy() will allocate a page as if the thread had
+ // performed a read (load) access to that address, and return the
+ // ID of the node where that page was allocated."
+ buf := t.CopyScratchBuffer(1)
+ _, err := t.CopyInBytes(addr, buf)
+ if err != nil {
+ return 0, nil, err
+ }
+ policy = 0 // maxNodes == 1
+ }
+ if mode != 0 {
+ if _, err := t.CopyOut(mode, policy); err != nil {
+ return 0, nil, err
+ }
+ }
+ if nodemask != 0 {
+ if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
+ return 0, nil, err
+ }
+ }
+ return 0, nil, nil
+ }
+
+ // "EINVAL: ... flags specified MPOL_F_ADDR and addr is NULL, or flags did
+ // not specify MPOL_F_ADDR and addr is not NULL." This is partially
+ // inaccurate: if flags specifies MPOL_F_ADDR,
+ // mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will
+ // just (usually) fail to find a VMA at address 0 and return EFAULT.
+ if addr != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+
+ // "If flags is specified as 0, then information about the calling thread's
+ // default policy (as set by set_mempolicy(2)) is returned, in the buffers
+ // pointed to by mode and nodemask. ... If flags specifies MPOL_F_NODE, but
+ // not MPOL_F_ADDR, and the thread's current policy is MPOL_INTERLEAVE,
+ // then get_mempolicy() will return in the location pointed to by a
+ // non-NULL mode argument, the node ID of the next node that will be used
+ // for interleaving of internal kernel pages allocated on behalf of the
+ // thread."
+ policy, nodemaskVal := t.NumaPolicy()
+ if nodeFlag {
+ if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE {
+ return 0, nil, syserror.EINVAL
+ }
+ policy = 0 // maxNodes == 1
+ }
+ if mode != 0 {
+ if _, err := t.CopyOut(mode, policy); err != nil {
+ return 0, nil, err
+ }
+ }
+ if nodemask != 0 {
+ if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil {
+ return 0, nil, err
+ }
+ }
+ return 0, nil, nil
+}
+
+// SetMempolicy implements the syscall set_mempolicy(2).
+func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ modeWithFlags := args[0].Int()
+ nodemask := args[1].Pointer()
+ maxnode := args[2].Uint()
+
+ modeWithFlags, nodemaskVal, err := copyInMempolicyNodemask(t, modeWithFlags, nodemask, maxnode)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ t.SetNumaPolicy(modeWithFlags, nodemaskVal)
+ return 0, nil, nil
+}
+
+// Mbind implements the syscall mbind(2).
+func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ length := args[1].Uint64()
+ mode := args[2].Int()
+ nodemask := args[3].Pointer()
+ maxnode := args[4].Uint()
+ flags := args[5].Uint()
+
+ if flags&^linux.MPOL_MF_VALID != 0 {
+ return 0, nil, syserror.EINVAL
+ }
+ // "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be
+ // privileged (CAP_SYS_NICE) to use this flag." - mbind(2)
+ if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) {
+ return 0, nil, syserror.EPERM
+ }
+
+ mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ // Since we claim to have only a single node, all flags can be ignored
+ // (since all pages must already be on that single node).
+ err = t.MemoryManager().SetNumaPolicy(addr, length, mode, nodemaskVal)
+ return 0, nil, err
+}
+
+func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags int32, nodemask usermem.Addr, maxnode uint32) (int32, uint64, error) {
+ flags := modeWithFlags & linux.MPOL_MODE_FLAGS
+ mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS
+ if flags == linux.MPOL_MODE_FLAGS {
+ // Can't specify both mode flags simultaneously.
+ return 0, 0, syserror.EINVAL
+ }
+ if mode < 0 || mode >= linux.MPOL_MAX {
+ // Must specify a valid mode.
+ return 0, 0, syserror.EINVAL
+ }
+
+ var nodemaskVal uint64
+ if nodemask != 0 {
+ var err error
+ nodemaskVal, err = copyInNodemask(t, nodemask, maxnode)
+ if err != nil {
+ return 0, 0, err
+ }
+ }
+
+ switch mode {
+ case linux.MPOL_DEFAULT:
+ // "nodemask must be specified as NULL." - set_mempolicy(2). This is inaccurate;
+ // Linux allows a nodemask to be specified, as long as it is empty.
+ if nodemaskVal != 0 {
+ return 0, 0, syserror.EINVAL
+ }
+ case linux.MPOL_BIND, linux.MPOL_INTERLEAVE:
+ // These require a non-empty nodemask.
+ if nodemaskVal == 0 {
+ return 0, 0, syserror.EINVAL
+ }
+ case linux.MPOL_PREFERRED:
+ // This permits an empty nodemask, as long as no flags are set.
+ if nodemaskVal == 0 && flags != 0 {
+ return 0, 0, syserror.EINVAL
+ }
+ case linux.MPOL_LOCAL:
+ // This requires an empty nodemask and no flags set ...
+ if nodemaskVal != 0 || flags != 0 {
+ return 0, 0, syserror.EINVAL
+ }
+ // ... and is implemented as MPOL_PREFERRED.
+ mode = linux.MPOL_PREFERRED
+ default:
+ // Unknown mode, which we should have rejected above.
+ panic(fmt.Sprintf("unknown mode: %v", mode))
+ }
+
+ return mode | flags, nodemaskVal, nil
+}
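The nodemask handling above hinges on a non-obvious Linux quirk: mm/mempolicy.c:get_nodes() treats maxnode-1, not maxnode, as the number of mask bits, rounded up to whole unsigned longs. A minimal standalone sketch of that sizing arithmetic follows; the helper name and page-size constant are assumptions for illustration, not part of this change.

// Illustrative sketch: mirrors how copyInNodemask sizes the user-supplied
// nodemask. maxnode == 0 is rejected via unsigned wraparound, exactly as in
// the code above.
package main

import "fmt"

func nodemaskBytes(maxnode uint32) (int, error) {
	const pageSize = 4096
	bits := maxnode - 1 // maxnode == 0 wraps around and is rejected below.
	if bits > pageSize*8 {
		return 0, fmt.Errorf("maxnode %d out of range", maxnode)
	}
	if bits == 0 {
		return 0, nil // empty nodemask
	}
	numUint64 := (bits + 63) / 64 // round up to whole 8-byte words
	return int(numUint64) * 8, nil
}

func main() {
	for _, m := range []uint32{1, 2, 64, 65, 128, 129} {
		n, _ := nodemaskBytes(m)
		fmt.Printf("maxnode=%d -> copy in %d bytes\n", m, n)
	}
}

For example, maxnode=65 still copies only 8 bytes, because only 64 bits are considered; this matches the checks in copyInNodemask and copyOutNodemask above.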
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 64a6e639c..9926f0ac5 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -204,151 +204,6 @@ func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
}
}
-func copyOutIfNotNull(t *kernel.Task, ptr usermem.Addr, val interface{}) (int, error) {
- if ptr != 0 {
- return t.CopyOut(ptr, val)
- }
- return 0, nil
-}
-
-// GetMempolicy implements the syscall get_mempolicy(2).
-func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- mode := args[0].Pointer()
- nodemask := args[1].Pointer()
- maxnode := args[2].Uint()
- addr := args[3].Pointer()
- flags := args[4].Uint()
-
- memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0
- nodeFlag := flags&linux.MPOL_F_NODE != 0
- addrFlag := flags&linux.MPOL_F_ADDR != 0
-
- // TODO(rahat): Once sysfs is implemented, report a single numa node in
- // /sys/devices/system/node.
- if nodemask != 0 && maxnode < 1 {
- return 0, nil, syserror.EINVAL
- }
-
- // 'addr' provided iff 'addrFlag' set.
- if addrFlag == (addr == 0) {
- return 0, nil, syserror.EINVAL
- }
-
- // Default policy for the thread.
- if flags == 0 {
- policy, nodemaskVal := t.NumaPolicy()
- if _, err := copyOutIfNotNull(t, mode, policy); err != nil {
- return 0, nil, syserror.EFAULT
- }
- if _, err := copyOutIfNotNull(t, nodemask, nodemaskVal); err != nil {
- return 0, nil, syserror.EFAULT
- }
- return 0, nil, nil
- }
-
- // Report all nodes available to caller.
- if memsAllowed {
- // MPOL_F_NODE and MPOL_F_ADDR not allowed with MPOL_F_MEMS_ALLOWED.
- if nodeFlag || addrFlag {
- return 0, nil, syserror.EINVAL
- }
-
- // Report a single numa node.
- if _, err := copyOutIfNotNull(t, nodemask, uint32(0x1)); err != nil {
- return 0, nil, syserror.EFAULT
- }
- return 0, nil, nil
- }
-
- if addrFlag {
- if nodeFlag {
- // Return the id for the node where 'addr' resides, via 'mode'.
- //
- // The real get_mempolicy(2) allocates the page referenced by 'addr'
- // by simulating a read, if it is unallocated before the call. It
- // then returns the node the page is allocated on through the mode
- // pointer.
- b := t.CopyScratchBuffer(1)
- _, err := t.CopyInBytes(addr, b)
- if err != nil {
- return 0, nil, syserror.EFAULT
- }
- if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil {
- return 0, nil, syserror.EFAULT
- }
- } else {
- storedPolicy, _ := t.NumaPolicy()
- // Return the policy governing the memory referenced by 'addr'.
- if _, err := copyOutIfNotNull(t, mode, int32(storedPolicy)); err != nil {
- return 0, nil, syserror.EFAULT
- }
- }
- return 0, nil, nil
- }
-
- storedPolicy, _ := t.NumaPolicy()
- if nodeFlag && (storedPolicy&^linux.MPOL_MODE_FLAGS == linux.MPOL_INTERLEAVE) {
- // Policy for current thread is to interleave memory between
- // nodes. Return the next node we'll allocate on. Since we only have a
- // single node, this is always node 0.
- if _, err := copyOutIfNotNull(t, mode, int32(0)); err != nil {
- return 0, nil, syserror.EFAULT
- }
- return 0, nil, nil
- }
-
- return 0, nil, syserror.EINVAL
-}
-
-func allowedNodesMask() uint32 {
- const maxNodes = 1
- return ^uint32((1 << maxNodes) - 1)
-}
-
-// SetMempolicy implements the syscall set_mempolicy(2).
-func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- modeWithFlags := args[0].Int()
- nodemask := args[1].Pointer()
- maxnode := args[2].Uint()
-
- if nodemask != 0 && maxnode < 1 {
- return 0, nil, syserror.EINVAL
- }
-
- if modeWithFlags&linux.MPOL_MODE_FLAGS == linux.MPOL_MODE_FLAGS {
- // Can't specify multiple modes simultaneously.
- return 0, nil, syserror.EINVAL
- }
-
- mode := modeWithFlags &^ linux.MPOL_MODE_FLAGS
- if mode < 0 || mode >= linux.MPOL_MAX {
- // Must specify a valid mode.
- return 0, nil, syserror.EINVAL
- }
-
- var nodemaskVal uint32
- // Nodemask may be empty for some policy modes.
- if nodemask != 0 && maxnode > 0 {
- if _, err := t.CopyIn(nodemask, &nodemaskVal); err != nil {
- return 0, nil, syserror.EFAULT
- }
- }
-
- if (mode == linux.MPOL_INTERLEAVE || mode == linux.MPOL_BIND) && nodemaskVal == 0 {
- // Mode requires a non-empty nodemask, but got an empty nodemask.
- return 0, nil, syserror.EINVAL
- }
-
- if nodemaskVal&allowedNodesMask() != 0 {
- // Invalid node specified.
- return 0, nil, syserror.EINVAL
- }
-
- t.SetNumaPolicy(int32(modeWithFlags), nodemaskVal)
-
- return 0, nil, nil
-}
-
// Mincore implements the syscall mincore(2).
func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
addr := args[0].Pointer()
diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go
index 117ae1a0e..1b7e5616b 100644
--- a/pkg/sentry/syscalls/linux/sys_prctl.go
+++ b/pkg/sentry/syscalls/linux/sys_prctl.go
@@ -15,6 +15,7 @@
package linux
import (
+ "fmt"
"syscall"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
@@ -23,6 +24,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/mm"
)
// Prctl implements linux syscall prctl(2).
@@ -44,6 +46,33 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
_, err := t.CopyOut(args[1].Pointer(), int32(t.ParentDeathSignal()))
return 0, nil, err
+ case linux.PR_GET_DUMPABLE:
+ d := t.MemoryManager().Dumpability()
+ switch d {
+ case mm.NotDumpable:
+ return linux.SUID_DUMP_DISABLE, nil, nil
+ case mm.UserDumpable:
+ return linux.SUID_DUMP_USER, nil, nil
+ case mm.RootDumpable:
+ return linux.SUID_DUMP_ROOT, nil, nil
+ default:
+ panic(fmt.Sprintf("Unknown dumpability %v", d))
+ }
+
+ case linux.PR_SET_DUMPABLE:
+ var d mm.Dumpability
+ switch args[1].Int() {
+ case linux.SUID_DUMP_DISABLE:
+ d = mm.NotDumpable
+ case linux.SUID_DUMP_USER:
+ d = mm.UserDumpable
+ default:
+ // N.B. Userspace may not pass SUID_DUMP_ROOT.
+ return 0, nil, syscall.EINVAL
+ }
+ t.MemoryManager().SetDumpability(d)
+ return 0, nil, nil
+
case linux.PR_GET_KEEPCAPS:
if t.Credentials().KeepCaps {
return 1, nil, nil
@@ -171,9 +200,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
return 0, nil, t.DropBoundingCapability(cp)
- case linux.PR_GET_DUMPABLE,
- linux.PR_SET_DUMPABLE,
- linux.PR_GET_TIMING,
+ case linux.PR_GET_TIMING,
linux.PR_SET_TIMING,
linux.PR_GET_TSC,
linux.PR_SET_TSC,
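The new PR_GET_DUMPABLE / PR_SET_DUMPABLE handling can be exercised from a guest program. Below is a hedged usage sketch using golang.org/x/sys/unix; it is guest-side test code, not part of this change, and the literal 1 stands for SUID_DUMP_USER.

// Hedged usage sketch: set and read back process dumpability via prctl(2).
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// 1 == SUID_DUMP_USER; 0 (SUID_DUMP_DISABLE) is also accepted.
	// SUID_DUMP_ROOT cannot be set from userspace and yields EINVAL.
	if err := unix.Prctl(unix.PR_SET_DUMPABLE, 1, 0, 0, 0); err != nil {
		fmt.Println("PR_SET_DUMPABLE:", err)
		return
	}
	// PR_GET_DUMPABLE reports the value in the syscall return, so use a raw
	// syscall rather than unix.Prctl (which only returns an error).
	d, _, errno := unix.Syscall(unix.SYS_PRCTL, unix.PR_GET_DUMPABLE, 0, 0)
	if errno != 0 {
		fmt.Println("PR_GET_DUMPABLE:", errno)
		return
	}
	fmt.Println("dumpable =", d) // expect 1
}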
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index 8f4dbf3bc..31295a6a9 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -188,7 +188,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
}
// Create the new socket.
- s, e := socket.New(t, domain, transport.SockType(stype&0xf), protocol)
+ s, e := socket.New(t, domain, linux.SockType(stype&0xf), protocol)
if e != nil {
return 0, nil, e.ToError()
}
@@ -227,7 +227,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
// Create the socket pair.
- s1, s2, e := socket.Pair(t, domain, transport.SockType(stype&0xf), protocol)
+ s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol)
if e != nil {
return 0, nil, e.ToError()
}
diff --git a/pkg/sentry/syscalls/syscalls.go b/pkg/sentry/syscalls/syscalls.go
index 5d10b3824..48c114232 100644
--- a/pkg/sentry/syscalls/syscalls.go
+++ b/pkg/sentry/syscalls/syscalls.go
@@ -25,37 +25,97 @@
package syscalls
import (
+ "fmt"
+ "syscall"
+
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
+// Supported returns a syscall that is fully supported.
+func Supported(name string, fn kernel.SyscallFn) kernel.Syscall {
+ return kernel.Syscall{
+ Name: name,
+ Fn: fn,
+ SupportLevel: kernel.SupportFull,
+ Note: "Full Support",
+ }
+}
+
+// Undocumented returns a syscall that is implemented but whose support level
+// has not yet been documented.
+func Undocumented(name string, fn kernel.SyscallFn) kernel.Syscall {
+ return kernel.Syscall{
+ Name: name,
+ Fn: fn,
+ SupportLevel: kernel.SupportUndocumented,
+ }
+}
+
+// PartiallySupported returns a syscall that has a partial implementation.
+func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []string) kernel.Syscall {
+ return kernel.Syscall{
+ Name: name,
+ Fn: fn,
+ SupportLevel: kernel.SupportPartial,
+ Note: note,
+ URLs: urls,
+ }
+}
+
// Error returns a syscall handler that will always give the passed error.
-func Error(err error) kernel.SyscallFn {
- return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- return 0, nil, err
+func Error(name string, err syscall.Errno, note string, urls []string) kernel.Syscall {
+ if note != "" {
+ note = note + "; "
+ }
+ return kernel.Syscall{
+ Name: name,
+ Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ return 0, nil, err
+ },
+ SupportLevel: kernel.SupportUnimplemented,
+ Note: fmt.Sprintf("%sReturns %q", note, err.Error()),
+ URLs: urls,
}
}
// ErrorWithEvent gives a syscall function that sends an unimplemented
// syscall event via the event channel and returns the passed error.
-func ErrorWithEvent(err error) kernel.SyscallFn {
- return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- t.Kernel().EmitUnimplementedEvent(t)
- return 0, nil, err
+func ErrorWithEvent(name string, err syscall.Errno, note string, urls []string) kernel.Syscall {
+ if note != "" {
+ note = note + "; "
+ }
+ return kernel.Syscall{
+ Name: name,
+ Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, err
+ },
+ SupportLevel: kernel.SupportUnimplemented,
+ Note: fmt.Sprintf("%sReturns %q", note, err.Error()),
+ URLs: urls,
}
}
// CapError gives a syscall function that checks for capability c. If the task
// has the capability, it returns ENOSYS, otherwise EPERM. To unprivileged
// tasks, it will seem like there is an implementation.
-func CapError(c linux.Capability) kernel.SyscallFn {
- return func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
- if !t.HasCapability(c) {
- return 0, nil, syserror.EPERM
- }
- t.Kernel().EmitUnimplementedEvent(t)
- return 0, nil, syserror.ENOSYS
+func CapError(name string, c linux.Capability, note string, urls []string) kernel.Syscall {
+ if note != "" {
+ note = note + "; "
+ }
+ return kernel.Syscall{
+ Name: name,
+ Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ if !t.HasCapability(c) {
+ return 0, nil, syserror.EPERM
+ }
+ t.Kernel().EmitUnimplementedEvent(t)
+ return 0, nil, syserror.ENOSYS
+ },
+ SupportLevel: kernel.SupportUnimplemented,
+ Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise", note, syserror.EPERM, c.String(), syserror.ENOSYS),
+ URLs: urls,
}
}
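Because every entry now carries Name, SupportLevel, Note and URLs, a syscall table can be enumerated to produce a compatibility report. The sketch below is illustrative only: it assumes the table is the map[uintptr]kernel.Syscall shown in linux64.go, and the package name and output format are arbitrary.

// Hedged sketch: enumerate per-syscall support metadata attached by the
// constructors above.
package compat

import (
	"fmt"
	"sort"

	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
)

// dumpSupport prints one line per table entry, ordered by syscall number.
func dumpSupport(table map[uintptr]kernel.Syscall) {
	nums := make([]uintptr, 0, len(table))
	for num := range table {
		nums = append(nums, num)
	}
	sort.Slice(nums, func(i, j int) bool { return nums[i] < nums[j] })
	for _, num := range nums {
		sc := table[num]
		fmt.Printf("%3d %-20s level=%v note=%q urls=%v\n",
			num, sc.Name, sc.SupportLevel, sc.Note, sc.URLs)
	}
}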
diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD
index b2f8f6832..b50579a92 100644
--- a/pkg/sentry/time/BUILD
+++ b/pkg/sentry/time/BUILD
@@ -32,7 +32,7 @@ go_library(
"tsc_arm64.s",
],
importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/time",
- visibility = ["//pkg/sentry:internal"],
+ visibility = ["//:sandbox"],
deps = [
"//pkg/log",
"//pkg/metric",
diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD
index 09198496b..860733061 100644
--- a/pkg/sentry/usage/BUILD
+++ b/pkg/sentry/usage/BUILD
@@ -17,6 +17,6 @@ go_library(
],
deps = [
"//pkg/bits",
- "//pkg/sentry/memutil",
+ "//pkg/memutil",
],
)
diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go
index c316f1597..9ed974ccb 100644
--- a/pkg/sentry/usage/memory.go
+++ b/pkg/sentry/usage/memory.go
@@ -22,7 +22,7 @@ import (
"syscall"
"gvisor.googlesource.com/gvisor/pkg/bits"
- "gvisor.googlesource.com/gvisor/pkg/sentry/memutil"
+ "gvisor.googlesource.com/gvisor/pkg/memutil"
)
// MemoryKind represents a type of memory used by the application.
diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go
index 31e4d6ada..9dde327a2 100644
--- a/pkg/sentry/usermem/usermem.go
+++ b/pkg/sentry/usermem/usermem.go
@@ -222,9 +222,11 @@ func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts
return int(r.Addr - addr), nil
}
-// copyStringIncrement is the maximum number of bytes that are copied from
-// virtual memory at a time by CopyStringIn.
-const copyStringIncrement = 64
+// CopyStringIn tuning parameters, defined outside that function for tests.
+const (
+ copyStringIncrement = 64
+ copyStringMaxInitBufLen = 256
+)
// CopyStringIn copies a NUL-terminated string of unknown length from the
// memory mapped at addr in uio and returns it as a string (not including the
@@ -234,31 +236,38 @@ const copyStringIncrement = 64
//
// Preconditions: As for IO.CopyFromUser. maxlen >= 0.
func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) {
- buf := make([]byte, maxlen)
+ initLen := maxlen
+ if initLen > copyStringMaxInitBufLen {
+ initLen = copyStringMaxInitBufLen
+ }
+ buf := make([]byte, initLen)
var done int
for done < maxlen {
- start, ok := addr.AddLength(uint64(done))
- if !ok {
- // Last page of kernel memory. The application can't use this
- // anyway.
- return stringFromImmutableBytes(buf[:done]), syserror.EFAULT
- }
// Read up to copyStringIncrement bytes at a time.
readlen := copyStringIncrement
if readlen > maxlen-done {
readlen = maxlen - done
}
- end, ok := start.AddLength(uint64(readlen))
+ end, ok := addr.AddLength(uint64(readlen))
if !ok {
return stringFromImmutableBytes(buf[:done]), syserror.EFAULT
}
// Shorten the read to avoid crossing page boundaries, since faulting
// in a page unnecessarily is expensive. This also ensures that partial
// copies up to the end of application-mappable memory succeed.
- if start.RoundDown() != end.RoundDown() {
+ if addr.RoundDown() != end.RoundDown() {
end = end.RoundDown()
+ readlen = int(end - addr)
+ }
+ // Ensure that our buffer is large enough to accommodate the read.
+ if done+readlen > len(buf) {
+ newBufLen := len(buf) * 2
+ if newBufLen > maxlen {
+ newBufLen = maxlen
+ }
+ buf = append(buf, make([]byte, newBufLen-len(buf))...)
}
- n, err := uio.CopyIn(ctx, start, buf[done:done+int(end-start)], opts)
+ n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts)
// Look for the terminating zero byte, which may have occurred before
// hitting err.
for i, c := range buf[done : done+n] {
@@ -270,6 +279,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
if err != nil {
return stringFromImmutableBytes(buf[:done]), err
}
+ addr = end
}
return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG
}
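CopyStringIn now starts with a small buffer (at most copyStringMaxInitBufLen bytes) and doubles it, capped at maxlen, only when a read would overflow it, instead of allocating maxlen up front. The following standalone sketch applies the same growth policy to an in-memory string; it is a hypothetical helper, whereas the real code reads user memory via IO.CopyIn and additionally handles page boundaries and EFAULT.

// Hedged sketch of the buffer-growth policy used by CopyStringIn.
package main

import (
	"fmt"
	"strings"
)

const (
	increment = 64  // bytes read per step (cf. copyStringIncrement)
	maxInit   = 256 // initial buffer cap (cf. copyStringMaxInitBufLen)
)

// readCString copies bytes from src until a NUL byte, maxlen, or the end of
// src, growing its buffer geometrically. It returns the string read and
// whether a terminating NUL was found.
func readCString(src string, maxlen int) (string, bool) {
	initLen := maxlen
	if initLen > maxInit {
		initLen = maxInit
	}
	buf := make([]byte, initLen)
	done := 0
	for done < maxlen {
		readlen := increment
		if readlen > maxlen-done {
			readlen = maxlen - done
		}
		// Grow the buffer only when this read would overflow it.
		if done+readlen > len(buf) {
			newLen := len(buf) * 2
			if newLen > maxlen {
				newLen = maxlen
			}
			buf = append(buf, make([]byte, newLen-len(buf))...)
		}
		n := copy(buf[done:done+readlen], src[done:])
		for i, c := range buf[done : done+n] {
			if c == 0 {
				return string(buf[:done+i]), true // found NUL
			}
		}
		done += n
		if n < readlen {
			return string(buf[:done]), false // source exhausted, no NUL
		}
	}
	return string(buf[:done]), false // maxlen exceeded (ENAMETOOLONG analogue)
}

func main() {
	s := strings.Repeat("A", 300) + "\x00garbage"
	got, ok := readCString(s, 1024)
	fmt.Println(len(got), ok) // 300 true
}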
diff --git a/pkg/sentry/usermem/usermem_test.go b/pkg/sentry/usermem/usermem_test.go
index 4a07118b7..575e5039d 100644
--- a/pkg/sentry/usermem/usermem_test.go
+++ b/pkg/sentry/usermem/usermem_test.go
@@ -192,6 +192,7 @@ func TestCopyObject(t *testing.T) {
}
func TestCopyStringInShort(t *testing.T) {
+ // Tests for string length <= copyStringIncrement.
want := strings.Repeat("A", copyStringIncrement-2)
mem := want + "\x00"
if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil {
@@ -200,13 +201,25 @@ func TestCopyStringInShort(t *testing.T) {
}
func TestCopyStringInLong(t *testing.T) {
- want := strings.Repeat("A", copyStringIncrement+1)
+ // Tests for copyStringIncrement < string length <= copyStringMaxInitBufLen
+ // (requiring multiple calls to IO.CopyIn()).
+ want := strings.Repeat("A", copyStringIncrement*3/4) + strings.Repeat("B", copyStringIncrement*3/4)
mem := want + "\x00"
if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil {
t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want)
}
}
+func TestCopyStringInVeryLong(t *testing.T) {
+ // Tests for string length > copyStringMaxInitBufLen (requiring buffer
+ // reallocation).
+ want := strings.Repeat("A", copyStringMaxInitBufLen*3/4) + strings.Repeat("B", copyStringMaxInitBufLen*3/4)
+ mem := want + "\x00"
+ if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringMaxInitBufLen, IOOpts{}); got != want || err != nil {
+ t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want)
+ }
+}
+
func TestCopyStringInNoTerminatingZeroByte(t *testing.T) {
want := strings.Repeat("A", copyStringIncrement-1)
got, err := CopyStringIn(newContext(), newBytesIOString(want), 0, 2*copyStringIncrement, IOOpts{})
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
index 1f889c2a0..b88e2e7bf 100644
--- a/pkg/tcpip/link/fdbased/endpoint.go
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -21,12 +21,29 @@
// FD based endpoints can be used in the networking stack by calling New() to
// create a new endpoint, and then passing it as an argument to
// Stack.CreateNIC().
+//
+// FD based endpoints can use more than one file descriptor to read incoming
+// packets. If more than one FD is specified and the underlying FDs are
+// AF_PACKET sockets, the endpoint enables PACKET_FANOUT mode on them so that
+// the host kernel consistently hashes packets to the sockets. This ensures
+// that packets for the same TCP stream are not reordered.
+//
+// Similarly, if more than one FD is specified and the underlying FDs are not
+// AF_PACKET sockets, it is the caller's responsibility to ensure that all
+// inbound packets on the descriptors are consistently 5-tuple hashed to one
+// of the descriptors to prevent TCP reordering.
+//
+// Since netstack today does not compute 5-tuple hashes for outgoing packets,
+// we only use the first FD to write outbound packets. Once 5-tuple hashes for
+// all outbound packets are available, we will make use of all underlying FDs
+// to write outbound packets.
package fdbased
import (
"fmt"
"syscall"
+ "golang.org/x/sys/unix"
"gvisor.googlesource.com/gvisor/pkg/tcpip"
"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
@@ -65,8 +82,10 @@ const (
)
type endpoint struct {
- // fd is the file descriptor used to send and receive packets.
- fd int
+ // fds is the set of file descriptors each identifying one inbound/outbound
+ // channel. The endpoint will dispatch from all inbound channels as well as
+ // hash outbound packets to specific channels based on the packet hash.
+ fds []int
// mtu (maximum transmission unit) is the maximum size of a packet.
mtu uint32
@@ -85,8 +104,8 @@ type endpoint struct {
// its end of the communication pipe.
closed func(*tcpip.Error)
- inboundDispatcher linkDispatcher
- dispatcher stack.NetworkDispatcher
+ inboundDispatchers []linkDispatcher
+ dispatcher stack.NetworkDispatcher
// packetDispatchMode controls the packet dispatcher used by this
// endpoint.
@@ -99,17 +118,47 @@ type endpoint struct {
// Options specify the details about the fd-based endpoint to be created.
type Options struct {
- FD int
- MTU uint32
- EthernetHeader bool
- ClosedFunc func(*tcpip.Error)
- Address tcpip.LinkAddress
- SaveRestore bool
- DisconnectOk bool
- GSOMaxSize uint32
+ // FDs is a set of FDs used to read/write packets.
+ FDs []int
+
+ // MTU is the mtu to use for this endpoint.
+ MTU uint32
+
+ // EthernetHeader if true, indicates that the endpoint should read/write
+ // ethernet frames instead of IP packets.
+ EthernetHeader bool
+
+ // ClosedFunc is a function to be called when an endpoint's peer (if
+ // any) closes its end of the communication pipe.
+ ClosedFunc func(*tcpip.Error)
+
+ // Address is the link address for this endpoint. Only used if
+ // EthernetHeader is true.
+ Address tcpip.LinkAddress
+
+ // SaveRestore if true, indicates that this NIC capability set should
+ // include CapabilitySaveRestore.
+ SaveRestore bool
+
+ // DisconnectOk if true, indicates that this NIC capability set should
+ // include CapabilityDisconnectOk.
+ DisconnectOk bool
+
+ // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is
+ // disabled.
+ GSOMaxSize uint32
+
+ // PacketDispatchMode specifies the type of inbound dispatcher to be
+ // used for this endpoint.
PacketDispatchMode PacketDispatchMode
- TXChecksumOffload bool
- RXChecksumOffload bool
+
+ // TXChecksumOffload if true, indicates that this endpoint's capability
+ // set should include CapabilityTXChecksumOffload.
+ TXChecksumOffload bool
+
+ // RXChecksumOffload if true, indicates that this endpoint's capability
+ // set should include CapabilityRXChecksumOffload.
+ RXChecksumOffload bool
}
// New creates a new fd-based endpoint.
@@ -117,10 +166,6 @@ type Options struct {
// Makes fd non-blocking, but does not take ownership of fd, which must remain
// open for the lifetime of the returned endpoint.
func New(opts *Options) (tcpip.LinkEndpointID, error) {
- if err := syscall.SetNonblock(opts.FD, true); err != nil {
- return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", opts.FD, err)
- }
-
caps := stack.LinkEndpointCapabilities(0)
if opts.RXChecksumOffload {
caps |= stack.CapabilityRXChecksumOffload
@@ -144,8 +189,12 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) {
caps |= stack.CapabilityDisconnectOk
}
+ if len(opts.FDs) == 0 {
+ return 0, fmt.Errorf("opts.FDs is empty, at least one FD must be specified")
+ }
+
e := &endpoint{
- fd: opts.FD,
+ fds: opts.FDs,
mtu: opts.MTU,
caps: caps,
closed: opts.ClosedFunc,
@@ -154,46 +203,71 @@ func New(opts *Options) (tcpip.LinkEndpointID, error) {
packetDispatchMode: opts.PacketDispatchMode,
}
- isSocket, err := isSocketFD(e.fd)
- if err != nil {
- return 0, err
- }
- if isSocket {
- if opts.GSOMaxSize != 0 {
- e.caps |= stack.CapabilityGSO
- e.gsoMaxSize = opts.GSOMaxSize
+ // Create per channel dispatchers.
+ for i := 0; i < len(e.fds); i++ {
+ fd := e.fds[i]
+ if err := syscall.SetNonblock(fd, true); err != nil {
+ return 0, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", fd, err)
}
- }
- e.inboundDispatcher, err = createInboundDispatcher(e, isSocket)
- if err != nil {
- return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err)
+
+ isSocket, err := isSocketFD(fd)
+ if err != nil {
+ return 0, err
+ }
+ if isSocket {
+ if opts.GSOMaxSize != 0 {
+ e.caps |= stack.CapabilityGSO
+ e.gsoMaxSize = opts.GSOMaxSize
+ }
+ }
+ inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket)
+ if err != nil {
+ return 0, fmt.Errorf("createInboundDispatcher(...) = %v", err)
+ }
+ e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher)
}
return stack.RegisterLinkEndpoint(e), nil
}
-func createInboundDispatcher(e *endpoint, isSocket bool) (linkDispatcher, error) {
+func createInboundDispatcher(e *endpoint, fd int, isSocket bool) (linkDispatcher, error) {
// By default use the readv() dispatcher as it works with all kinds of
// FDs (tap/tun/unix domain sockets and af_packet).
- inboundDispatcher, err := newReadVDispatcher(e.fd, e)
+ inboundDispatcher, err := newReadVDispatcher(fd, e)
if err != nil {
- return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", e.fd, e, err)
+ return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err)
}
if isSocket {
+ sa, err := unix.Getsockname(fd)
+ if err != nil {
+ return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err)
+ }
+ switch sa.(type) {
+ case *unix.SockaddrLinklayer:
+ // Enable PACKET_FANOUT mode if the underlying socket is
+ // of type AF_PACKET.
+ const fanoutID = 1
+ const fanoutType = 0x8000 // PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG
+ fanoutArg := fanoutID | fanoutType<<16
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil {
+ return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err)
+ }
+ }
+
switch e.packetDispatchMode {
case PacketMMap:
- inboundDispatcher, err = newPacketMMapDispatcher(e.fd, e)
+ inboundDispatcher, err = newPacketMMapDispatcher(fd, e)
if err != nil {
- return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", e.fd, e, err)
+ return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err)
}
case RecvMMsg:
// If the provided FD is a socket then we optimize
// packet reads by using recvmmsg() instead of read() to
// read packets in a batch.
- inboundDispatcher, err = newRecvMMsgDispatcher(e.fd, e)
+ inboundDispatcher, err = newRecvMMsgDispatcher(fd, e)
if err != nil {
- return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", e.fd, e, err)
+ return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err)
}
}
}
@@ -215,7 +289,9 @@ func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
// Link endpoints are not savable. When transportation endpoints are
// saved, they stop sending outgoing packets and all incoming packets
// are rejected.
- go e.dispatchLoop() // S/R-SAFE: See above.
+ for i := range e.inboundDispatchers {
+ go e.dispatchLoop(e.inboundDispatchers[i]) // S/R-SAFE: See above.
+ }
}
// IsAttached implements stack.LinkEndpoint.IsAttached.
@@ -305,26 +381,26 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
}
}
- return rawfile.NonBlockingWrite3(e.fd, vnetHdrBuf, hdr.View(), payload.ToView())
+ return rawfile.NonBlockingWrite3(e.fds[0], vnetHdrBuf, hdr.View(), payload.ToView())
}
if payload.Size() == 0 {
- return rawfile.NonBlockingWrite(e.fd, hdr.View())
+ return rawfile.NonBlockingWrite(e.fds[0], hdr.View())
}
- return rawfile.NonBlockingWrite3(e.fd, hdr.View(), payload.ToView(), nil)
+ return rawfile.NonBlockingWrite3(e.fds[0], hdr.View(), payload.ToView(), nil)
}
// WriteRawPacket writes a raw packet directly to the file descriptor.
func (e *endpoint) WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error {
- return rawfile.NonBlockingWrite(e.fd, packet)
+ return rawfile.NonBlockingWrite(e.fds[0], packet)
}
// dispatchLoop reads packets from the file descriptor in a loop and dispatches
// them to the network stack.
-func (e *endpoint) dispatchLoop() *tcpip.Error {
+func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) *tcpip.Error {
for {
- cont, err := e.inboundDispatcher.dispatch()
+ cont, err := inboundDispatcher.dispatch()
if err != nil || !cont {
if e.closed != nil {
e.closed(err)
@@ -363,7 +439,7 @@ func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabiliti
syscall.SetNonblock(fd, true)
e := &InjectableEndpoint{endpoint: endpoint{
- fd: fd,
+ fds: []int{fd},
mtu: mtu,
caps: capabilities,
}}
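For readers following the per-FD loop above: several AF_PACKET sockets bound to the same interface can share a kernel fanout group, so the kernel spreads flows across the FDs that each get their own inbound dispatcher here. Below is a sketch of how a caller might prepare such FDs for Options.FDs. It is illustrative only: it assumes golang.org/x/sys/unix, the htons helper and the interface index are made up for the example, error-path cleanup is omitted, and the PACKET_FANOUT group itself is joined by createInboundDispatcher above, so the caller does not set it.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// htons converts a 16-bit value to network byte order for the AF_PACKET
// protocol fields.
func htons(v uint16) uint16 { return v<<8 | v>>8 }

// openPacketSockets opens n AF_PACKET sockets bound to the interface with
// index ifIndex. The returned FDs are suitable for fdbased.Options.FDs; the
// endpoint then makes them non-blocking and joins them to fanout group 1.
func openPacketSockets(ifIndex, n int) ([]int, error) {
	fds := make([]int, 0, n)
	for i := 0; i < n; i++ {
		fd, err := unix.Socket(unix.AF_PACKET, unix.SOCK_RAW, int(htons(unix.ETH_P_ALL)))
		if err != nil {
			return nil, fmt.Errorf("socket(AF_PACKET): %v", err)
		}
		sa := &unix.SockaddrLinklayer{
			Protocol: htons(unix.ETH_P_ALL),
			Ifindex:  ifIndex,
		}
		if err := unix.Bind(fd, sa); err != nil {
			return nil, fmt.Errorf("bind to ifindex %d: %v", ifIndex, err)
		}
		fds = append(fds, fd)
	}
	return fds, nil
}

func main() {
	// The interface index (2) is an arbitrary example value.
	fds, err := openPacketSockets(2, 4)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println("FDs for fdbased.Options.FDs:", fds)
}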
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
index fd1722074..ba3e09192 100644
--- a/pkg/tcpip/link/fdbased/endpoint_test.go
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -67,7 +67,7 @@ func newContext(t *testing.T, opt *Options) *context {
done <- struct{}{}
}
- opt.FD = fds[1]
+ opt.FDs = []int{fds[1]}
epID, err := New(opt)
if err != nil {
t.Fatalf("Failed to create FD endpoint: %v", err)
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
index fccabd554..98581e50e 100644
--- a/pkg/tcpip/link/sniffer/sniffer.go
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -118,7 +118,7 @@ func NewWithFile(lower tcpip.LinkEndpointID, file *os.File, snapLen uint32) (tcp
// logs the packet before forwarding to the actual dispatcher.
func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
- logPacket("recv", protocol, vv.First())
+ logPacket("recv", protocol, vv.First(), nil)
}
if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
vs := vv.Views()
@@ -198,7 +198,7 @@ func (e *endpoint) GSOMaxSize() uint32 {
// the request to the lower endpoint.
func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error {
if atomic.LoadUint32(&LogPackets) == 1 && e.file == nil {
- logPacket("send", protocol, hdr.View())
+ logPacket("send", protocol, hdr.View(), gso)
}
if e.file != nil && atomic.LoadUint32(&LogPacketsToFile) == 1 {
hdrBuf := hdr.View()
@@ -240,7 +240,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
return e.lower.WritePacket(r, gso, hdr, payload, protocol)
}
-func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.View) {
+func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.View, gso *stack.GSO) {
// Figure out the network layer info.
var transProto uint8
src := tcpip.Address("unknown")
@@ -404,5 +404,9 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, b buffer.Vie
return
}
+ if gso != nil {
+ details += fmt.Sprintf(" gso: %+v", gso)
+ }
+
log.Infof("%s %s %v:%v -> %v:%v len:%d id:%04x %s", prefix, transName, src, srcPort, dst, dstPort, size, id, details)
}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
index da07a39e5..44b1d5b9b 100644
--- a/pkg/tcpip/network/ipv4/ipv4.go
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -215,7 +215,9 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
views[0] = hdr.View()
views = append(views, payload.Views()...)
vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
- e.HandlePacket(r, vv)
+ loopedR := r.MakeLoopedRoute()
+ e.HandlePacket(&loopedR, vv)
+ loopedR.Release()
}
if loop&stack.PacketOut == 0 {
return nil
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
index 4b8cd496b..bcae98e1f 100644
--- a/pkg/tcpip/network/ipv6/ipv6.go
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -108,7 +108,9 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, hdr buffer.Prepen
views[0] = hdr.View()
views = append(views, payload.Views()...)
vv := buffer.NewVectorisedView(len(views[0])+payload.Size(), views)
- e.HandlePacket(r, vv)
+ loopedR := r.MakeLoopedRoute()
+ e.HandlePacket(&loopedR, vv)
+ loopedR.Release()
}
if loop&stack.PacketOut == 0 {
return nil
diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go
index 1681de56e..1fa899e7e 100644
--- a/pkg/tcpip/sample/tun_tcp_connect/main.go
+++ b/pkg/tcpip/sample/tun_tcp_connect/main.go
@@ -137,7 +137,7 @@ func main() {
log.Fatal(err)
}
- linkID, err := fdbased.New(&fdbased.Options{FD: fd, MTU: mtu})
+ linkID, err := fdbased.New(&fdbased.Options{FDs: []int{fd}, MTU: mtu})
if err != nil {
log.Fatal(err)
}
diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go
index 642607f83..d47085581 100644
--- a/pkg/tcpip/sample/tun_tcp_echo/main.go
+++ b/pkg/tcpip/sample/tun_tcp_echo/main.go
@@ -129,7 +129,7 @@ func main() {
}
linkID, err := fdbased.New(&fdbased.Options{
- FD: fd,
+ FDs: []int{fd},
MTU: mtu,
EthernetHeader: *tap,
Address: tcpip.LinkAddress(maddr),
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
index 3d4c282a9..55ed02479 100644
--- a/pkg/tcpip/stack/route.go
+++ b/pkg/tcpip/stack/route.go
@@ -187,3 +187,13 @@ func (r *Route) Clone() Route {
r.ref.incRef()
return *r
}
+
+// MakeLoopedRoute duplicates the given route and tweaks it in case of multicast.
+func (r *Route) MakeLoopedRoute() Route {
+ l := r.Clone()
+ if header.IsV4MulticastAddress(r.RemoteAddress) || header.IsV6MulticastAddress(r.RemoteAddress) {
+ l.RemoteAddress, l.LocalAddress = l.LocalAddress, l.RemoteAddress
+ l.RemoteLinkAddress = l.LocalLinkAddress
+ }
+ return l
+}
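Why the swap above matters to the ipv4 and ipv6 loopback paths earlier in this diff: the route used to send a multicast packet has LocalAddress set to our own address and RemoteAddress set to the group, while the looped copy must be handled as an inbound packet, i.e. sourced from us and destined to the group. A toy model of the swap, using strings in place of tcpip.Address; it illustrates the effect only and is not the stack.Route type.

package main

import "fmt"

// loopedAddrs models only the two fields MakeLoopedRoute touches.
type loopedAddrs struct {
	local, remote string
}

// makeLooped mirrors the swap above: for multicast destinations the looped
// copy must appear to arrive from our own address, addressed to the group.
func makeLooped(r loopedAddrs, multicast bool) loopedAddrs {
	l := r // clone
	if multicast {
		l.remote, l.local = l.local, l.remote
	}
	return l
}

func main() {
	out := loopedAddrs{local: "10.0.0.1", remote: "224.0.0.1"}
	in := makeLooped(out, true)
	// The looped copy is handled as if it was received from 10.0.0.1 and
	// addressed to the multicast group 224.0.0.1.
	fmt.Printf("local=%s remote=%s\n", in.local, in.remote)
}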
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
index 8d74f1543..e8a9392b5 100644
--- a/pkg/tcpip/stack/transport_test.go
+++ b/pkg/tcpip/stack/transport_test.go
@@ -188,6 +188,10 @@ func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, s
f.proto.controlCount++
}
+func (f *fakeTransportEndpoint) State() uint32 {
+ return 0
+}
+
type fakeTransportGoodOption bool
type fakeTransportBadOption bool
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
index f9886c6e4..04c776205 100644
--- a/pkg/tcpip/tcpip.go
+++ b/pkg/tcpip/tcpip.go
@@ -377,6 +377,10 @@ type Endpoint interface {
// GetSockOpt gets a socket option. opt should be a pointer to one of the
// *Option types.
GetSockOpt(opt interface{}) *Error
+
+ // State returns a socket's lifecycle state. The returned value is
+ // protocol-specific and is primarily used for diagnostics.
+ State() uint32
}
// WriteOptions contains options for Endpoint.Write.
@@ -468,6 +472,14 @@ type KeepaliveIntervalOption time.Duration
// closed.
type KeepaliveCountOption int
+// CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
+// the current congestion control algorithm.
+type CongestionControlOption string
+
+// AvailableCongestionControlOption is used to query the supported congestion
+// control algorithms.
+type AvailableCongestionControlOption string
+
// MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default
// TTL value for multicast messages. The default is 1.
type MulticastTTLOption uint8
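Now that both option types live in the tcpip package, callers can drive the congestion control algorithm through the generic Endpoint interface. A minimal sketch of endpoint-level usage, assuming ep is a TCP endpoint obtained elsewhere; the function name and the error wrapping are illustrative, not part of this change.

package tcpoptexample

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/tcpip"
)

// switchToCubic reads the endpoint's current congestion control algorithm
// and then switches it to cubic.
func switchToCubic(ep tcpip.Endpoint) error {
	var cur tcpip.CongestionControlOption
	if err := ep.GetSockOpt(&cur); err != nil {
		return fmt.Errorf("GetSockOpt(CongestionControlOption): %v", err)
	}
	fmt.Printf("current algorithm: %q\n", cur)

	// Unknown names fail with tcpip.ErrNoSuchFile, matching the ENOENT
	// Linux returns for TCP_CONGESTION.
	if err := ep.SetSockOpt(tcpip.CongestionControlOption("cubic")); err != nil {
		return fmt.Errorf("SetSockOpt(cubic): %v", err)
	}
	return nil
}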
diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD
index 9aa6f3978..84a2b53b7 100644
--- a/pkg/tcpip/transport/icmp/BUILD
+++ b/pkg/tcpip/transport/icmp/BUILD
@@ -33,6 +33,7 @@ go_library(
"//pkg/tcpip/header",
"//pkg/tcpip/stack",
"//pkg/tcpip/transport/raw",
+ "//pkg/tcpip/transport/tcp",
"//pkg/waiter",
],
)
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index e2b90ef10..b8005093a 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -708,3 +708,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
}
+
+// State implements tcpip.Endpoint.State. The ICMP endpoint currently doesn't
+// expose internal socket state.
+func (e *endpoint) State() uint32 {
+ return 0
+}
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 1daf5823f..e4ff50c91 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -519,3 +519,8 @@ func (ep *endpoint) HandlePacket(route *stack.Route, netHeader buffer.View, vv b
ep.waiterQueue.Notify(waiter.EventIn)
}
}
+
+// State implements socket.Socket.State.
+func (ep *endpoint) State() uint32 {
+ return 0
+}
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index e31b03f7d..a9dbfb930 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -21,6 +21,7 @@ go_library(
"accept.go",
"connect.go",
"cubic.go",
+ "cubic_state.go",
"endpoint.go",
"endpoint_state.go",
"forwarder.go",
@@ -70,6 +71,7 @@ go_test(
srcs = [
"dual_stack_test.go",
"sack_scoreboard_test.go",
+ "tcp_noracedetector_test.go",
"tcp_sack_test.go",
"tcp_test.go",
"tcp_timestamp_test.go",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index d4b860975..d05259c0a 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -19,7 +19,6 @@ import (
"encoding/binary"
"hash"
"io"
- "log"
"sync"
"time"
@@ -227,7 +226,6 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
}
n.isRegistered = true
- n.state = stateConnecting
// Create sender and receiver.
//
@@ -253,14 +251,15 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head
// Perform the 3-way handshake.
h := newHandshake(ep, l.rcvWnd)
- h.resetToSynRcvd(cookie, irs, opts, l.listenEP)
+ h.resetToSynRcvd(cookie, irs, opts)
if err := h.execute(); err != nil {
ep.stack.Stats().TCP.FailedConnectionAttempts.Increment()
ep.Close()
return nil, err
}
-
- ep.state = stateConnected
+ ep.mu.Lock()
+ ep.state = StateEstablished
+ ep.mu.Unlock()
// Update the receive window scaling. We can't do it before the
// handshake because it's possible that the peer doesn't support window
@@ -277,7 +276,7 @@ func (e *endpoint) deliverAccepted(n *endpoint) {
e.mu.RLock()
state := e.state
e.mu.RUnlock()
- if state == stateListen {
+ if state == StateListen {
e.acceptedChan <- n
e.waiterQueue.Notify(waiter.EventIn)
} else {
@@ -295,7 +294,6 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
defer decSynRcvdCount()
defer e.decSynRcvdCount()
defer s.decRef()
-
n, err := ctx.createEndpointAndPerformHandshake(s, opts)
if err != nil {
e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
@@ -307,8 +305,7 @@ func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header
func (e *endpoint) incSynRcvdCount() bool {
e.mu.Lock()
- log.Printf("l: %d, c: %d, e.synRcvdCount: %d", len(e.acceptedChan), cap(e.acceptedChan), e.synRcvdCount)
- if l, c := len(e.acceptedChan), cap(e.acceptedChan); l == c && e.synRcvdCount >= c {
+ if e.synRcvdCount >= cap(e.acceptedChan) {
e.mu.Unlock()
return false
}
@@ -323,6 +320,16 @@ func (e *endpoint) decSynRcvdCount() {
e.mu.Unlock()
}
+func (e *endpoint) acceptQueueIsFull() bool {
+ e.mu.Lock()
+ if l, c := len(e.acceptedChan)+e.synRcvdCount, cap(e.acceptedChan); l >= c {
+ e.mu.Unlock()
+ return true
+ }
+ e.mu.Unlock()
+ return false
+}
+
// handleListenSegment is called when a listening endpoint receives a segment
// and needs to handle it.
func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
@@ -330,20 +337,27 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
case header.TCPFlagSyn:
opts := parseSynSegmentOptions(s)
if incSynRcvdCount() {
- // Drop the SYN if the listen endpoint's accept queue is
- // overflowing.
- if e.incSynRcvdCount() {
- log.Printf("processing syn packet")
+ // Only handle the SYN if the following conditions hold:
+ // - accept queue is not full.
+ // - number of connections in synRcvd state is less than the
+ // backlog.
+ if !e.acceptQueueIsFull() && e.incSynRcvdCount() {
s.incRef()
go e.handleSynSegment(ctx, s, &opts) // S/R-SAFE: synRcvdCount is the barrier.
return
}
- log.Printf("dropping syn packet")
+ decSynRcvdCount()
e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
e.stack.Stats().DroppedPackets.Increment()
return
} else {
- // TODO(bhaskerh): Increment syncookie sent stat.
+ // If cookies are in use but the endpoint accept queue
+ // is full then drop the syn.
+ if e.acceptQueueIsFull() {
+ e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
+ e.stack.Stats().DroppedPackets.Increment()
+ return
+ }
cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS))
// Send SYN with window scaling because we currently
// dont't encode this information in the cookie.
@@ -361,7 +375,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
}
case header.TCPFlagAck:
- if len(e.acceptedChan) == cap(e.acceptedChan) {
+ if e.acceptQueueIsFull() {
// Silently drop the ack as the application can't accept
// the connection at this point. The ack will be
// retransmitted by the sender anyway and we can
@@ -411,7 +425,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
n.tsOffset = 0
// Switch state to connected.
- n.state = stateConnected
+ n.state = StateEstablished
// Do the delivery in a separate goroutine so
// that we don't block the listen loop in case
@@ -434,7 +448,7 @@ func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
// handleSynSegment() from attempting to queue new connections
// to the endpoint.
e.mu.Lock()
- e.state = stateClosed
+ e.state = StateClose
// Do cleanup if needed.
e.completeWorkerLocked()
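The combined effect of acceptQueueIsFull and the SYN-RCVD accounting above is a two-sided admission check on incoming SYNs. The model below restates that policy with stand-in fields so the conditions are easy to read; it is illustrative and is not the endpoint's real structure.

package tcp

// synAdmission models the listener-side checks introduced above. The fields
// stand in for len(acceptedChan), cap(acceptedChan) and synRcvdCount.
type synAdmission struct {
	accepted int  // completed connections waiting in the accept queue
	backlog  int  // capacity of the accept queue
	synRcvd  int  // handshakes currently in SYN-RCVD on this listener
	cookies  bool // true once the stack-wide SYN-RCVD budget forces cookies
}

// admit reports whether a new SYN should be processed; a false return
// corresponds to bumping ListenOverflowSynDrop and DroppedPackets above.
func (a *synAdmission) admit() bool {
	queueFull := a.accepted+a.synRcvd >= a.backlog
	if !a.cookies {
		// Without cookies we also need a free SYN-RCVD slot before
		// spawning a handshake goroutine.
		return !queueFull && a.synRcvd < a.backlog
	}
	// With cookies no per-connection handshake state is kept, but a full
	// accept queue still means the completed connection could not be
	// delivered, so the SYN is dropped anyway.
	return !queueFull
}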
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index 2aed6f286..dd671f7ce 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -60,12 +60,11 @@ const (
// handshake holds the state used during a TCP 3-way handshake.
type handshake struct {
- ep *endpoint
- listenEP *endpoint // only non nil when doing passive connects.
- state handshakeState
- active bool
- flags uint8
- ackNum seqnum.Value
+ ep *endpoint
+ state handshakeState
+ active bool
+ flags uint8
+ ackNum seqnum.Value
// iss is the initial send sequence number, as defined in RFC 793.
iss seqnum.Value
@@ -142,7 +141,7 @@ func (h *handshake) effectiveRcvWndScale() uint8 {
// resetToSynRcvd resets the state of the handshake object to the SYN-RCVD
// state.
-func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, listenEP *endpoint) {
+func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions) {
h.active = false
h.state = handshakeSynRcvd
h.flags = header.TCPFlagSyn | header.TCPFlagAck
@@ -150,7 +149,9 @@ func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *hea
h.ackNum = irs + 1
h.mss = opts.MSS
h.sndWndScale = opts.WS
- h.listenEP = listenEP
+ h.ep.mu.Lock()
+ h.ep.state = StateSynRecv
+ h.ep.mu.Unlock()
}
// checkAck checks if the ACK number, if present, of a segment received during
@@ -219,6 +220,9 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
// but resend our own SYN and wait for it to be acknowledged in the
// SYN-RCVD state.
h.state = handshakeSynRcvd
+ h.ep.mu.Lock()
+ h.ep.state = StateSynRecv
+ h.ep.mu.Unlock()
synOpts := header.TCPSynOptions{
WS: h.rcvWndScale,
TS: rcvSynOpts.TS,
@@ -281,18 +285,6 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
// We have previously received (and acknowledged) the peer's SYN. If the
// peer acknowledges our SYN, the handshake is completed.
if s.flagIsSet(header.TCPFlagAck) {
- // listenContext is also used by a tcp.Forwarder and in that
- // context we do not have a listening endpoint to check the
- // backlog. So skip this check if listenEP is nil.
- if h.listenEP != nil && len(h.listenEP.acceptedChan) == cap(h.listenEP.acceptedChan) {
- // If there is no space in the accept queue to accept
- // this endpoint then silently drop this ACK. The peer
- // will anyway resend the ack and we can complete the
- // connection the next time it's retransmitted.
- h.ep.stack.Stats().TCP.ListenOverflowAckDrop.Increment()
- h.ep.stack.Stats().DroppedPackets.Increment()
- return nil
- }
// If the timestamp option is negotiated and the segment does
// not carry a timestamp option then the segment must be dropped
// as per https://tools.ietf.org/html/rfc7323#section-3.2.
@@ -663,7 +655,7 @@ func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
// sendRaw sends a TCP segment to the endpoint's peer.
func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error {
var sackBlocks []header.SACKBlock
- if e.state == stateConnected && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
+ if e.state == StateEstablished && e.rcv.pendingBufSize > 0 && (flags&header.TCPFlagAck != 0) {
sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
}
options := e.makeOptions(sackBlocks)
@@ -714,8 +706,7 @@ func (e *endpoint) handleClose() *tcpip.Error {
// protocol goroutine.
func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, e.snd.sndUna, e.rcv.rcvNxt, 0)
-
- e.state = stateError
+ e.state = StateError
e.hardError = err
}
@@ -871,14 +862,19 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
// handshake, and then inform potential waiters about its
// completion.
h := newHandshake(e, seqnum.Size(e.receiveBufferAvailable()))
+ e.mu.Lock()
+ h.ep.state = StateSynSent
+ e.mu.Unlock()
+
if err := h.execute(); err != nil {
e.lastErrorMu.Lock()
e.lastError = err
e.lastErrorMu.Unlock()
e.mu.Lock()
- e.state = stateError
+ e.state = StateError
e.hardError = err
+
// Lock released below.
epilogue()
@@ -900,7 +896,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
// Tell waiters that the endpoint is connected and writable.
e.mu.Lock()
- e.state = stateConnected
+ e.state = StateEstablished
drained := e.drainDone != nil
e.mu.Unlock()
if drained {
@@ -1000,7 +996,7 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
return err
}
}
- if e.state != stateError {
+ if e.state != StateError {
close(e.drainDone)
<-e.undrain
}
@@ -1056,8 +1052,8 @@ func (e *endpoint) protocolMainLoop(handshake bool) *tcpip.Error {
// Mark endpoint as closed.
e.mu.Lock()
- if e.state != stateError {
- e.state = stateClosed
+ if e.state != StateError {
+ e.state = StateClose
}
// Lock released below.
epilogue()
diff --git a/pkg/tcpip/transport/tcp/cubic.go b/pkg/tcpip/transport/tcp/cubic.go
index e618cd2b9..7b1f5e763 100644
--- a/pkg/tcpip/transport/tcp/cubic.go
+++ b/pkg/tcpip/transport/tcp/cubic.go
@@ -23,6 +23,7 @@ import (
// control algorithm state.
//
// See: https://tools.ietf.org/html/rfc8312.
+// +stateify savable
type cubicState struct {
// wLastMax is the previous wMax value.
wLastMax float64
@@ -33,7 +34,7 @@ type cubicState struct {
// t denotes the time when the current congestion avoidance
// was entered.
- t time.Time
+ t time.Time `state:".(unixTime)"`
// numCongestionEvents tracks the number of congestion events since last
// RTO.
diff --git a/pkg/tcpip/transport/tcp/cubic_state.go b/pkg/tcpip/transport/tcp/cubic_state.go
new file mode 100644
index 000000000..d0f58cfaf
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/cubic_state.go
@@ -0,0 +1,29 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+ "time"
+)
+
+// saveT is invoked by stateify.
+func (c *cubicState) saveT() unixTime {
+ return unixTime{c.t.Unix(), c.t.UnixNano()}
+}
+
+// loadT is invoked by stateify.
+func (c *cubicState) loadT(unix unixTime) {
+ c.t = time.Unix(unix.second, unix.nano)
+}
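The saveT/loadT pair is the usual stateify escape hatch for fields whose type cannot be serialized directly: the state:".(unixTime)" tag on cubicState.t routes the field through these hooks. The sketch below applies the same idiom to a hypothetical struct; rttSample is not part of this change, and it assumes the package-local unixTime type (second and nano fields) that the hooks above already rely on.

package tcp

import "time"

// rttSample is a hypothetical savable struct with a time.Time field, shown
// only to illustrate the stateify idiom used by cubicState above.
//
// +stateify savable
type rttSample struct {
	rtt time.Duration

	// sampledAt cannot be serialized directly, so it is round-tripped
	// through the package's unixTime helper, exactly like cubicState.t.
	sampledAt time.Time `state:".(unixTime)"`
}

// saveSampledAt is invoked by stateify.
func (r *rttSample) saveSampledAt() unixTime {
	return unixTime{r.sampledAt.Unix(), r.sampledAt.UnixNano()}
}

// loadSampledAt is invoked by stateify.
func (r *rttSample) loadSampledAt(unix unixTime) {
	r.sampledAt = time.Unix(unix.second, unix.nano)
}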
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index b66610ee2..1efe9d3fb 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -17,6 +17,7 @@ package tcp
import (
"fmt"
"math"
+ "strings"
"sync"
"sync/atomic"
"time"
@@ -32,18 +33,81 @@ import (
"gvisor.googlesource.com/gvisor/pkg/waiter"
)
-type endpointState int
+// EndpointState represents the state of a TCP endpoint.
+type EndpointState uint32
+// Endpoint states. Note that they are represented in a netstack-specific manner and
+// may not be meaningful externally. Specifically, they need to be translated to
+// Linux's representation for these states if presented to userspace.
const (
- stateInitial endpointState = iota
- stateBound
- stateListen
- stateConnecting
- stateConnected
- stateClosed
- stateError
+ // Endpoint states internal to netstack. These map to the TCP state CLOSED.
+ StateInitial EndpointState = iota
+ StateBound
+ StateConnecting // Connect() called, but the initial SYN hasn't been sent.
+ StateError
+
+ // TCP protocol states.
+ StateEstablished
+ StateSynSent
+ StateSynRecv
+ StateFinWait1
+ StateFinWait2
+ StateTimeWait
+ StateClose
+ StateCloseWait
+ StateLastAck
+ StateListen
+ StateClosing
)
+// connected is the set of states where an endpoint is connected to a peer.
+func (s EndpointState) connected() bool {
+ switch s {
+ case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
+ return true
+ default:
+ return false
+ }
+}
+
+// String implements fmt.Stringer.String.
+func (s EndpointState) String() string {
+ switch s {
+ case StateInitial:
+ return "INITIAL"
+ case StateBound:
+ return "BOUND"
+ case StateConnecting:
+ return "CONNECTING"
+ case StateError:
+ return "ERROR"
+ case StateEstablished:
+ return "ESTABLISHED"
+ case StateSynSent:
+ return "SYN-SENT"
+ case StateSynRecv:
+ return "SYN-RCVD"
+ case StateFinWait1:
+ return "FIN-WAIT1"
+ case StateFinWait2:
+ return "FIN-WAIT2"
+ case StateTimeWait:
+ return "TIME-WAIT"
+ case StateClose:
+ return "CLOSED"
+ case StateCloseWait:
+ return "CLOSE-WAIT"
+ case StateLastAck:
+ return "LAST-ACK"
+ case StateListen:
+ return "LISTEN"
+ case StateClosing:
+ return "CLOSING"
+ default:
+ panic("unreachable")
+ }
+}
+
// Reasons for notifying the protocol goroutine.
const (
notifyNonZeroReceiveWindow = 1 << iota
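As the comment above says, these values are netstack-internal and must be translated before being shown to userspace (for example when filling in /proc/net/tcp). One possible translation is sketched below; the TCP_* numbering is Linux's standard one from include/net/tcp_states.h, but this mapping function is an illustration, not code from this change.

package tcp

// Linux's TCP state numbering (TCP_ESTABLISHED = 1 ... TCP_CLOSING = 11),
// reproduced only for the illustrative mapping below.
const (
	linuxTCPEstablished = uint32(iota + 1)
	linuxTCPSynSent
	linuxTCPSynRecv
	linuxTCPFinWait1
	linuxTCPFinWait2
	linuxTCPTimeWait
	linuxTCPClose
	linuxTCPCloseWait
	linuxTCPLastAck
	linuxTCPListen
	linuxTCPClosing
)

// linuxState converts an EndpointState into Linux's representation; the
// states internal to netstack all surface as TCP_CLOSE.
func linuxState(s EndpointState) uint32 {
	switch s {
	case StateEstablished:
		return linuxTCPEstablished
	case StateSynSent:
		return linuxTCPSynSent
	case StateSynRecv:
		return linuxTCPSynRecv
	case StateFinWait1:
		return linuxTCPFinWait1
	case StateFinWait2:
		return linuxTCPFinWait2
	case StateTimeWait:
		return linuxTCPTimeWait
	case StateCloseWait:
		return linuxTCPCloseWait
	case StateLastAck:
		return linuxTCPLastAck
	case StateListen:
		return linuxTCPListen
	case StateClosing:
		return linuxTCPClosing
	default:
		// StateInitial, StateBound, StateConnecting, StateError and
		// StateClose all map to CLOSED.
		return linuxTCPClose
	}
}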
@@ -108,10 +172,14 @@ type endpoint struct {
rcvBufUsed int
// The following fields are protected by the mutex.
- mu sync.RWMutex `state:"nosave"`
- id stack.TransportEndpointID
- state endpointState `state:".(endpointState)"`
- isPortReserved bool `state:"manual"`
+ mu sync.RWMutex `state:"nosave"`
+ id stack.TransportEndpointID
+
+ // state endpointState `state:".(endpointState)"`
+ // pState ProtocolState
+ state EndpointState `state:".(EndpointState)"`
+
+ isPortReserved bool `state:"manual"`
isRegistered bool
boundNICID tcpip.NICID `state:"manual"`
route stack.Route `state:"manual"`
@@ -219,7 +287,7 @@ type endpoint struct {
// cc stores the name of the Congestion Control algorithm to use for
// this endpoint.
- cc CongestionControlOption
+ cc tcpip.CongestionControlOption
// The following are used when a "packet too big" control packet is
// received. They are protected by sndBufMu. They are used to
@@ -304,6 +372,7 @@ func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waite
stack: stack,
netProto: netProto,
waiterQueue: waiterQueue,
+ state: StateInitial,
rcvBufSize: DefaultBufferSize,
sndBufSize: DefaultBufferSize,
sndMTU: int(math.MaxInt32),
@@ -326,7 +395,7 @@ func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waite
e.rcvBufSize = rs.Default
}
- var cs CongestionControlOption
+ var cs tcpip.CongestionControlOption
if err := stack.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
e.cc = cs
}
@@ -335,7 +404,7 @@ func newEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waite
e.probe = p
}
- e.segmentQueue.setLimit(2 * e.rcvBufSize)
+ e.segmentQueue.setLimit(MaxUnprocessedSegments)
e.workMu.Init()
e.workMu.Lock()
e.tsOffset = timeStampOffset()
@@ -351,14 +420,14 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
defer e.mu.RUnlock()
switch e.state {
- case stateInitial, stateBound, stateConnecting:
+ case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
// Ready for nothing.
- case stateClosed, stateError:
+ case StateClose, StateError:
// Ready for anything.
result = mask
- case stateListen:
+ case StateListen:
// Check if there's anything in the accepted channel.
if (mask & waiter.EventIn) != 0 {
if len(e.acceptedChan) > 0 {
@@ -366,7 +435,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
}
}
- case stateConnected:
+ case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
// Determine if the endpoint is writable if requested.
if (mask & waiter.EventOut) != 0 {
e.sndBufMu.Lock()
@@ -427,7 +496,7 @@ func (e *endpoint) Close() {
// are immediately available for reuse after Close() is called. If also
// registered, we unregister as well otherwise the next user would fail
// in Listen() when trying to register.
- if e.state == stateListen && e.isPortReserved {
+ if e.state == StateListen && e.isPortReserved {
if e.isRegistered {
e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.id, e)
e.isRegistered = false
@@ -487,15 +556,15 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
e.mu.RLock()
// The endpoint can be read if it's connected, or if it's already closed
// but has some pending unread data. Also note that a RST being received
- // would cause the state to become stateError so we should allow the
+ // would cause the state to become StateError so we should allow the
// reads to proceed before returning a ECONNRESET.
e.rcvListMu.Lock()
bufUsed := e.rcvBufUsed
- if s := e.state; s != stateConnected && s != stateClosed && bufUsed == 0 {
+ if s := e.state; !s.connected() && s != StateClose && bufUsed == 0 {
e.rcvListMu.Unlock()
he := e.hardError
e.mu.RUnlock()
- if s == stateError {
+ if s == StateError {
return buffer.View{}, tcpip.ControlMessages{}, he
}
return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
@@ -511,7 +580,7 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
if e.rcvBufUsed == 0 {
- if e.rcvClosed || e.state != stateConnected {
+ if e.rcvClosed || !e.state.connected() {
return buffer.View{}, tcpip.ErrClosedForReceive
}
return buffer.View{}, tcpip.ErrWouldBlock
@@ -547,9 +616,9 @@ func (e *endpoint) Write(p tcpip.Payload, opts tcpip.WriteOptions) (uintptr, <-c
defer e.mu.RUnlock()
// The endpoint cannot be written to if it's not connected.
- if e.state != stateConnected {
+ if !e.state.connected() {
switch e.state {
- case stateError:
+ case StateError:
return 0, nil, e.hardError
default:
return 0, nil, tcpip.ErrClosedForSend
@@ -612,8 +681,8 @@ func (e *endpoint) Peek(vec [][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Er
// The endpoint can be read if it's connected, or if it's already closed
// but has some pending unread data.
- if s := e.state; s != stateConnected && s != stateClosed {
- if s == stateError {
+ if s := e.state; !s.connected() && s != StateClose {
+ if s == StateError {
return 0, tcpip.ControlMessages{}, e.hardError
}
return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
@@ -623,7 +692,7 @@ func (e *endpoint) Peek(vec [][]byte) (uintptr, tcpip.ControlMessages, *tcpip.Er
defer e.rcvListMu.Unlock()
if e.rcvBufUsed == 0 {
- if e.rcvClosed || e.state != stateConnected {
+ if e.rcvClosed || !e.state.connected() {
return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
}
return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock
@@ -757,8 +826,6 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
}
e.rcvListMu.Unlock()
- e.segmentQueue.setLimit(2 * size)
-
e.notifyProtocolGoroutine(mask)
return nil
@@ -791,7 +858,7 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
defer e.mu.Unlock()
// We only allow this to be set when we're in the initial state.
- if e.state != stateInitial {
+ if e.state != StateInitial {
return tcpip.ErrInvalidEndpointState
}
@@ -832,6 +899,40 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
e.mu.Unlock()
return nil
+ case tcpip.CongestionControlOption:
+ // Query the available cc algorithms in the stack and
+ // validate that the specified algorithm is actually
+ // supported in the stack.
+ var avail tcpip.AvailableCongestionControlOption
+ if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
+ return err
+ }
+ availCC := strings.Split(string(avail), " ")
+ for _, cc := range availCC {
+ if v == tcpip.CongestionControlOption(cc) {
+ // Acquire the work mutex as we may need to
+ // reinitialize the congestion control state.
+ e.mu.Lock()
+ state := e.state
+ e.cc = v
+ e.mu.Unlock()
+ switch state {
+ case StateEstablished:
+ e.workMu.Lock()
+ e.mu.Lock()
+ if e.state == state {
+ e.snd.cc = e.snd.initCongestionControl(e.cc)
+ }
+ e.mu.Unlock()
+ e.workMu.Unlock()
+ }
+ return nil
+ }
+ }
+
+ // Linux returns ENOENT when an invalid congestion
+ // control algorithm is specified.
+ return tcpip.ErrNoSuchFile
default:
return nil
}
@@ -843,7 +944,7 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
defer e.mu.RUnlock()
// The endpoint cannot be in listen state.
- if e.state == stateListen {
+ if e.state == StateListen {
return 0, tcpip.ErrInvalidEndpointState
}
@@ -1001,6 +1102,12 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
}
return nil
+ case *tcpip.CongestionControlOption:
+ e.mu.Lock()
+ *o = e.cc
+ e.mu.Unlock()
+ return nil
+
default:
return tcpip.ErrUnknownProtocolOption
}
@@ -1059,7 +1166,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
nicid := addr.NIC
switch e.state {
- case stateBound:
+ case StateBound:
// If we're already bound to a NIC but the caller is requesting
// that we use a different one now, we cannot proceed.
if e.boundNICID == 0 {
@@ -1072,16 +1179,16 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
nicid = e.boundNICID
- case stateInitial:
- // Nothing to do. We'll eventually fill-in the gaps in the ID
- // (if any) when we find a route.
+ case StateInitial:
+ // Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
+ // when we find a route.
- case stateConnecting:
- // A connection request has already been issued but hasn't
- // completed yet.
+ case StateConnecting, StateSynSent, StateSynRecv:
+ // A connection request has already been issued but hasn't completed
+ // yet.
return tcpip.ErrAlreadyConnecting
- case stateConnected:
+ case StateEstablished:
// The endpoint is already connected. If caller hasn't been notified yet, return success.
if !e.isConnectNotified {
e.isConnectNotified = true
@@ -1090,7 +1197,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
// Otherwise return that it's already connected.
return tcpip.ErrAlreadyConnected
- case stateError:
+ case StateError:
return e.hardError
default:
@@ -1156,7 +1263,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
}
e.isRegistered = true
- e.state = stateConnecting
+ e.state = StateConnecting
e.route = r.Clone()
e.boundNICID = nicid
e.effectiveNetProtos = netProtos
@@ -1177,7 +1284,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) (er
}
e.segmentQueue.mu.Unlock()
e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
- e.state = stateConnected
+ e.state = StateEstablished
}
if run {
@@ -1201,8 +1308,8 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
defer e.mu.Unlock()
e.shutdownFlags |= flags
- switch e.state {
- case stateConnected:
+ switch {
+ case e.state.connected():
// Close for read.
if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
// Mark read side as closed.
@@ -1243,7 +1350,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
e.sndCloseWaker.Assert()
}
- case stateListen:
+ case e.state == StateListen:
// Tell protocolListenLoop to stop.
if flags&tcpip.ShutdownRead != 0 {
e.notifyProtocolGoroutine(notifyClose)
@@ -1271,7 +1378,7 @@ func (e *endpoint) Listen(backlog int) (err *tcpip.Error) {
// When the endpoint shuts down, it sets workerCleanup to true, and from
// that point onward, acceptedChan is the responsibility of the cleanup()
// method (and should not be touched anywhere else, including here).
- if e.state == stateListen && !e.workerCleanup {
+ if e.state == StateListen && !e.workerCleanup {
// Adjust the size of the channel iff we can fix existing
// pending connections into the new one.
if len(e.acceptedChan) > backlog {
@@ -1290,7 +1397,7 @@ func (e *endpoint) Listen(backlog int) (err *tcpip.Error) {
}
// Endpoint must be bound before it can transition to listen mode.
- if e.state != stateBound {
+ if e.state != StateBound {
return tcpip.ErrInvalidEndpointState
}
@@ -1300,7 +1407,7 @@ func (e *endpoint) Listen(backlog int) (err *tcpip.Error) {
}
e.isRegistered = true
- e.state = stateListen
+ e.state = StateListen
if e.acceptedChan == nil {
e.acceptedChan = make(chan *endpoint, backlog)
}
@@ -1327,7 +1434,7 @@ func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
defer e.mu.RUnlock()
// Endpoint must be in listen state before it can accept connections.
- if e.state != stateListen {
+ if e.state != StateListen {
return nil, nil, tcpip.ErrInvalidEndpointState
}
@@ -1355,7 +1462,7 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
// Don't allow binding once endpoint is not in the initial state
// anymore. This is because once the endpoint goes into a connected or
// listen state, it is already bound.
- if e.state != stateInitial {
+ if e.state != StateInitial {
return tcpip.ErrAlreadyBound
}
@@ -1410,7 +1517,7 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
}
// Mark endpoint as bound.
- e.state = stateBound
+ e.state = StateBound
return nil
}
@@ -1432,7 +1539,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
e.mu.RLock()
defer e.mu.RUnlock()
- if e.state != stateConnected {
+ if !e.state.connected() {
return tcpip.FullAddress{}, tcpip.ErrNotConnected
}
@@ -1741,3 +1848,11 @@ func (e *endpoint) initGSO() {
gso.MaxSize = e.route.GSOMaxSize()
e.gso = gso
}
+
+// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
+// state for diagnostics.
+func (e *endpoint) State() uint32 {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ return uint32(e.state)
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index 27b0be046..5f30c2374 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -49,8 +49,8 @@ func (e *endpoint) beforeSave() {
defer e.mu.Unlock()
switch e.state {
- case stateInitial, stateBound:
- case stateConnected:
+ case StateInitial, StateBound:
+ case StateEstablished, StateSynSent, StateSynRecv, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
if e.route.Capabilities()&stack.CapabilitySaveRestore == 0 {
if e.route.Capabilities()&stack.CapabilityDisconnectOk == 0 {
panic(tcpip.ErrSaveRejection{fmt.Errorf("endpoint cannot be saved in connected state: local %v:%d, remote %v:%d", e.id.LocalAddress, e.id.LocalPort, e.id.RemoteAddress, e.id.RemotePort)})
@@ -66,17 +66,17 @@ func (e *endpoint) beforeSave() {
break
}
fallthrough
- case stateListen, stateConnecting:
+ case StateListen, StateConnecting:
e.drainSegmentLocked()
- if e.state != stateClosed && e.state != stateError {
+ if e.state != StateClose && e.state != StateError {
if !e.workerRunning {
panic("endpoint has no worker running in listen, connecting, or connected state")
}
break
}
fallthrough
- case stateError, stateClosed:
- for e.state == stateError && e.workerRunning {
+ case StateError, StateClose:
+ for e.state == StateError && e.workerRunning {
e.mu.Unlock()
time.Sleep(100 * time.Millisecond)
e.mu.Lock()
@@ -92,7 +92,7 @@ func (e *endpoint) beforeSave() {
panic("endpoint still has waiters upon save")
}
- if e.state != stateClosed && !((e.state == stateBound || e.state == stateListen) == e.isPortReserved) {
+ if e.state != StateClose && !((e.state == StateBound || e.state == StateListen) == e.isPortReserved) {
panic("endpoints which are not in the closed state must have a reserved port IFF they are in bound or listen state")
}
}
@@ -132,7 +132,7 @@ func (e *endpoint) loadAcceptedChan(acceptedEndpoints []*endpoint) {
}
// saveState is invoked by stateify.
-func (e *endpoint) saveState() endpointState {
+func (e *endpoint) saveState() EndpointState {
return e.state
}
@@ -146,15 +146,15 @@ var connectingLoading sync.WaitGroup
// Bound endpoint loading happens last.
// loadState is invoked by stateify.
-func (e *endpoint) loadState(state endpointState) {
+func (e *endpoint) loadState(state EndpointState) {
// This is to ensure that the loading wait groups include all applicable
// endpoints before any asynchronous calls to the Wait() methods.
switch state {
- case stateConnected:
+ case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
connectedLoading.Add(1)
- case stateListen:
+ case StateListen:
listenLoading.Add(1)
- case stateConnecting:
+ case StateConnecting, StateSynSent, StateSynRecv:
connectingLoading.Add(1)
}
e.state = state
@@ -163,12 +163,12 @@ func (e *endpoint) loadState(state endpointState) {
// afterLoad is invoked by stateify.
func (e *endpoint) afterLoad() {
e.stack = stack.StackFromEnv
- e.segmentQueue.setLimit(2 * e.rcvBufSize)
+ e.segmentQueue.setLimit(MaxUnprocessedSegments)
e.workMu.Init()
state := e.state
switch state {
- case stateInitial, stateBound, stateListen, stateConnecting, stateConnected:
+ case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
var ss SendBufferSizeOption
if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
if e.sndBufSize < ss.Min || e.sndBufSize > ss.Max {
@@ -181,7 +181,7 @@ func (e *endpoint) afterLoad() {
}
bind := func() {
- e.state = stateInitial
+ e.state = StateInitial
if len(e.bindAddress) == 0 {
e.bindAddress = e.id.LocalAddress
}
@@ -191,7 +191,7 @@ func (e *endpoint) afterLoad() {
}
switch state {
- case stateConnected:
+ case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
bind()
if len(e.connectingAddress) == 0 {
// This endpoint is accepted by netstack but not yet by
@@ -211,7 +211,7 @@ func (e *endpoint) afterLoad() {
panic("endpoint connecting failed: " + err.String())
}
connectedLoading.Done()
- case stateListen:
+ case StateListen:
tcpip.AsyncLoading.Add(1)
go func() {
connectedLoading.Wait()
@@ -223,7 +223,7 @@ func (e *endpoint) afterLoad() {
listenLoading.Done()
tcpip.AsyncLoading.Done()
}()
- case stateConnecting:
+ case StateConnecting, StateSynSent, StateSynRecv:
tcpip.AsyncLoading.Add(1)
go func() {
connectedLoading.Wait()
@@ -235,7 +235,7 @@ func (e *endpoint) afterLoad() {
connectingLoading.Done()
tcpip.AsyncLoading.Done()
}()
- case stateBound:
+ case StateBound:
tcpip.AsyncLoading.Add(1)
go func() {
connectedLoading.Wait()
@@ -244,7 +244,7 @@ func (e *endpoint) afterLoad() {
bind()
tcpip.AsyncLoading.Done()
}()
- case stateClosed:
+ case StateClose:
if e.isPortReserved {
tcpip.AsyncLoading.Add(1)
go func() {
@@ -252,12 +252,12 @@ func (e *endpoint) afterLoad() {
listenLoading.Wait()
connectingLoading.Wait()
bind()
- e.state = stateClosed
+ e.state = StateClose
tcpip.AsyncLoading.Done()
}()
}
fallthrough
- case stateError:
+ case StateError:
tcpip.DeleteDanglingEndpoint(e)
}
}
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index d31a1edcb..59f4009a1 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -48,6 +48,10 @@ const (
// MaxBufferSize is the largest size a receive and send buffer can grow to.
maxBufferSize = 4 << 20 // 4MB
+
+ // MaxUnprocessedSegments is the maximum number of unprocessed segments
+ // that can be queued for a given endpoint.
+ MaxUnprocessedSegments = 300
)
// SACKEnabled option can be used to enable SACK support in the TCP
@@ -75,13 +79,6 @@ const (
ccCubic = "cubic"
)
-// CongestionControlOption sets the current congestion control algorithm.
-type CongestionControlOption string
-
-// AvailableCongestionControlOption returns the supported congestion control
-// algorithms.
-type AvailableCongestionControlOption string
-
type protocol struct {
mu sync.Mutex
sackEnabled bool
@@ -89,7 +86,6 @@ type protocol struct {
recvBufferSize ReceiveBufferSizeOption
congestionControl string
availableCongestionControl []string
- allowedCongestionControl []string
}
// Number returns the tcp protocol number.
@@ -184,7 +180,7 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
p.mu.Unlock()
return nil
- case CongestionControlOption:
+ case tcpip.CongestionControlOption:
for _, c := range p.availableCongestionControl {
if string(v) == c {
p.mu.Lock()
@@ -193,7 +189,9 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error {
return nil
}
}
- return tcpip.ErrInvalidOptionValue
+ // Linux returns ENOENT when an invalid congestion control
+ // algorithm is specified.
+ return tcpip.ErrNoSuchFile
default:
return tcpip.ErrUnknownProtocolOption
}
@@ -219,14 +217,14 @@ func (p *protocol) Option(option interface{}) *tcpip.Error {
*v = p.recvBufferSize
p.mu.Unlock()
return nil
- case *CongestionControlOption:
+ case *tcpip.CongestionControlOption:
p.mu.Lock()
- *v = CongestionControlOption(p.congestionControl)
+ *v = tcpip.CongestionControlOption(p.congestionControl)
p.mu.Unlock()
return nil
- case *AvailableCongestionControlOption:
+ case *tcpip.AvailableCongestionControlOption:
p.mu.Lock()
- *v = AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
+ *v = tcpip.AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
p.mu.Unlock()
return nil
default:
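The same option types also work one level up, as stack-wide defaults that newly created endpoints inherit. A sketch of configuring them through the stack; it assumes the Stack.TransportProtocolOption accessor used by the endpoint code above and its SetTransportProtocolOption counterpart, and the function name is made up for illustration.

package ccdefaults

import (
	"fmt"
	"strings"

	"gvisor.googlesource.com/gvisor/pkg/tcpip"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
	"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
)

// setDefaultCC lists the algorithms the stack supports and then makes name
// the default congestion control for new TCP endpoints.
func setDefaultCC(s *stack.Stack, name string) error {
	var avail tcpip.AvailableCongestionControlOption
	if err := s.TransportProtocolOption(tcp.ProtocolNumber, &avail); err != nil {
		return fmt.Errorf("querying available algorithms: %v", err)
	}
	fmt.Println("available:", strings.Split(string(avail), " "))

	// As above, an unknown name now fails with tcpip.ErrNoSuchFile rather
	// than ErrInvalidOptionValue.
	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.CongestionControlOption(name)); err != nil {
		return fmt.Errorf("setting %q: %v", name, err)
	}
	return nil
}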
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index b08a0e356..f02fa6105 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -134,6 +134,7 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
// sequence numbers that have been consumed.
TrimSACKBlockList(&r.ep.sack, r.rcvNxt)
+ // Handle FIN or FIN-ACK.
if s.flagIsSet(header.TCPFlagFin) {
r.rcvNxt++
@@ -144,6 +145,25 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
r.closed = true
r.ep.readyToRead(nil)
+ // We just received a FIN; our next state depends on whether we sent a
+ // FIN already or not.
+ r.ep.mu.Lock()
+ switch r.ep.state {
+ case StateEstablished:
+ r.ep.state = StateCloseWait
+ case StateFinWait1:
+ if s.flagIsSet(header.TCPFlagAck) {
+ // FIN-ACK, transition to TIME-WAIT.
+ r.ep.state = StateTimeWait
+ } else {
+ // Simultaneous close, expecting a final ACK.
+ r.ep.state = StateClosing
+ }
+ case StateFinWait2:
+ r.ep.state = StateTimeWait
+ }
+ r.ep.mu.Unlock()
+
// Flush out any pending segments, except the very first one if
// it happens to be the one we're handling now because the
// caller is using it.
@@ -156,6 +176,23 @@ func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum
r.pendingRcvdSegments[i].decRef()
}
r.pendingRcvdSegments = r.pendingRcvdSegments[:first]
+
+ return true
+ }
+
+ // Handle ACK (not FIN-ACK, which we handled above) during one of the
+ // shutdown states.
+ if s.flagIsSet(header.TCPFlagAck) {
+ r.ep.mu.Lock()
+ switch r.ep.state {
+ case StateFinWait1:
+ r.ep.state = StateFinWait2
+ case StateClosing:
+ r.ep.state = StateTimeWait
+ case StateLastAck:
+ r.ep.state = StateClose
+ }
+ r.ep.mu.Unlock()
}
return true
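Taken together, the two new blocks above implement the receiver's half of the TCP close diagram. The helpers below restate those transitions in isolation, using the package's EndpointState values; they are illustrative only and say nothing about the endpoint's locking.

package tcp

// onFIN models the transition taken above when a FIN (optionally carrying an
// ACK) is consumed.
func onFIN(cur EndpointState, withACK bool) EndpointState {
	switch cur {
	case StateEstablished:
		return StateCloseWait
	case StateFinWait1:
		if withACK {
			// FIN-ACK: our own FIN was acknowledged too.
			return StateTimeWait
		}
		// Simultaneous close, still expecting a final ACK.
		return StateClosing
	case StateFinWait2:
		return StateTimeWait
	}
	return cur
}

// onACK models the transition taken above for a bare ACK received while the
// connection is shutting down.
func onACK(cur EndpointState) EndpointState {
	switch cur {
	case StateFinWait1:
		return StateFinWait2
	case StateClosing:
		return StateTimeWait
	case StateLastAck:
		return StateClose
	}
	return cur
}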
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go
index 3b020e580..e0759225e 100644
--- a/pkg/tcpip/transport/tcp/segment_queue.go
+++ b/pkg/tcpip/transport/tcp/segment_queue.go
@@ -16,8 +16,6 @@ package tcp
import (
"sync"
-
- "gvisor.googlesource.com/gvisor/pkg/tcpip/header"
)
// segmentQueue is a bounded, thread-safe queue of TCP segments.
@@ -58,7 +56,7 @@ func (q *segmentQueue) enqueue(s *segment) bool {
r := q.used < q.limit
if r {
q.list.PushBack(s)
- q.used += s.data.Size() + header.TCPMinimumSize
+ q.used++
}
q.mu.Unlock()
@@ -73,7 +71,7 @@ func (q *segmentQueue) dequeue() *segment {
s := q.list.Front()
if s != nil {
q.list.Remove(s)
- q.used -= s.data.Size() + header.TCPMinimumSize
+ q.used--
}
q.mu.Unlock()
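With this change the queue is bounded by a fixed segment count (MaxUnprocessedSegments) instead of a byte budget tied to the receive buffer, so admission no longer shifts when the buffer size is tuned. A toy count-bounded queue with the same contract, where a false return from enqueue means the caller drops the segment; it is an illustration, not the package's segmentQueue, and it is not safe for concurrent use.

package tcp

// countedQueue is a toy, count-limited queue of segments.
type countedQueue struct {
	segments []*segment
	limit    int
}

// enqueue admits s only while fewer than limit segments are pending; callers
// drop the segment (and release their reference) on a false return.
func (q *countedQueue) enqueue(s *segment) bool {
	if len(q.segments) >= q.limit {
		return false
	}
	q.segments = append(q.segments, s)
	return true
}

// dequeue returns the oldest pending segment, or nil if the queue is empty.
func (q *countedQueue) dequeue() *segment {
	if len(q.segments) == 0 {
		return nil
	}
	s := q.segments[0]
	q.segments = q.segments[1:]
	return s
}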
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index afc1d0a55..daa3e8341 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -194,8 +194,6 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
s := &sender{
ep: ep,
- sndCwnd: InitialCwnd,
- sndSsthresh: math.MaxInt64,
sndWnd: sndWnd,
sndUna: iss + 1,
sndNxt: iss + 1,
@@ -238,7 +236,13 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
return s
}
-func (s *sender) initCongestionControl(congestionControlName CongestionControlOption) congestionControl {
+// initCongestionControl initializes the specified congestion control module and
+// returns a handle to it. It also initializes the sndCwnd and sndSsthresh to
+// their initial values.
+func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl {
+ s.sndCwnd = InitialCwnd
+ s.sndSsthresh = math.MaxInt64
+
switch congestionControlName {
case ccCubic:
return newCubicCC(s)
@@ -632,6 +636,10 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
}
seg.flags = header.TCPFlagAck | header.TCPFlagFin
segEnd = seg.sequenceNumber.Add(1)
+ // Transition to FIN-WAIT1 state since we're initiating an active close.
+ s.ep.mu.Lock()
+ s.ep.state = StateFinWait1
+ s.ep.mu.Unlock()
} else {
// We're sending a non-FIN segment.
if seg.flags&header.TCPFlagFin != 0 {
@@ -779,7 +787,7 @@ func (s *sender) sendData() {
break
}
dataSent = true
- s.outstanding++
+ s.outstanding += s.pCount(seg)
s.writeNext = seg.Next()
}
}
diff --git a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
new file mode 100644
index 000000000..4d1519860
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
@@ -0,0 +1,519 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// These tests are flaky when run under the Go race detector: some iterations
+// take long enough that the retransmit timer kicks in, and the extra
+// retransmitted packets throw off the congestion window measurements.
+//
+// +build !race
+
+package tcp_test
+
+import (
+ "fmt"
+ "math"
+ "testing"
+ "time"
+
+ "gvisor.googlesource.com/gvisor/pkg/tcpip"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/header"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp/testing/context"
+)
+
+func TestFastRecovery(t *testing.T) {
+ maxPayload := 32
+ c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+ defer c.Cleanup()
+
+ c.CreateConnected(789, 30000, nil)
+
+ const iterations = 7
+ data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+ for i := range data {
+ data[i] = byte(i)
+ }
+
+ // Write all the data in one shot. Packets will only be written at the
+ // MTU size though.
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+
+ // Do slow start for a few iterations.
+ expected := tcp.InitialCwnd
+ bytesRead := 0
+ for i := 0; i < iterations; i++ {
+ expected = tcp.InitialCwnd << uint(i)
+ if i > 0 {
+ // Acknowledge all the data received so far if not on
+ // first iteration.
+ c.SendAck(790, bytesRead)
+ }
+
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+ // The timeout can't be too high or we'll trigger a timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+ }
+
+ // Send 3 duplicate acks. This should force an immediate retransmit of
+ // the pending packet and put the sender into fast recovery.
+ rtxOffset := bytesRead - maxPayload*expected
+ for i := 0; i < 3; i++ {
+ c.SendAck(790, rtxOffset)
+ }
+
+ // Receive the retransmitted packet.
+ c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+ if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want {
+ t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
+ }
+
+ if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
+ t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
+ }
+
+ if got, want := c.Stack().Stats().TCP.FastRecovery.Value(), uint64(1); got != want {
+ t.Errorf("got stats.TCP.FastRecovery.Value = %v, want = %v", got, want)
+ }
+
+ // Now send 7 more duplicate acks. Each of these should cause a window
+ // inflation by 1 and cause the sender to send an extra packet.
+ for i := 0; i < 7; i++ {
+ c.SendAck(790, rtxOffset)
+ }
+
+ recover := bytesRead
+
+ // Ensure no new packets arrive.
+ c.CheckNoPacketTimeout("More packets received than expected during recovery after dupacks for this cwnd.",
+ 50*time.Millisecond)
+
+ // Acknowledge half of the pending data.
+ rtxOffset = bytesRead - expected*maxPayload/2
+ c.SendAck(790, rtxOffset)
+
+ // Receive the retransmit due to partial ack.
+ c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+ if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(2); got != want {
+ t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
+ }
+
+ if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(2); got != want {
+ t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
+ }
+
+ // Receive the 10 extra packets that should have been released due to
+ // the congestion window inflation in recovery.
+ for i := 0; i < 10; i++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // A partial ACK during recovery should reduce congestion window by the
+ // number acked. Since we had "expected" packets outstanding before sending
+ // partial ack and we acked expected/2, the cwnd and outstanding should
+ // be expected/2 + 10 (7 dupAcks + 3 for the original 3 dupAcks that triggered
+ // fast recovery), which means the sender should not send any more packets
+ // till we ack this one.
+ c.CheckNoPacketTimeout("More packets received than expected during recovery after partial ack for this cwnd.",
+ 50*time.Millisecond)
+
+ // Acknowledge all pending data to recover point.
+ c.SendAck(790, recover)
+
+ // At this point, the cwnd should reset to expected/2 and there are 10
+ // packets outstanding.
+ //
+ // NOTE: Technically netstack is incorrect in that we adjust the cwnd on
+ // the same segment that takes us out of recovery. But because of that
+ // the actual cwnd at exit of recovery will be expected/2 + 1 as we
+ // acked a cwnd worth of packets which will increase the cwnd further by
+ // 1 in congestion avoidance.
+ //
+ // In the first iteration, since there are already 10 packets outstanding,
+ // we expect to get expected/2 + 1 - 10 packets. Subsequent iterations
+ // will send us expected/2 + 1 + 1 (per iteration).
+ expected = expected/2 + 1 - 10
+ for i := 0; i < iterations; i++ {
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout(fmt.Sprintf("More packets received(after deflation) than expected %d for this cwnd.", expected), 50*time.Millisecond)
+
+ // Acknowledge all the data received so far.
+ c.SendAck(790, bytesRead)
+
+		// In congestion avoidance, the packet trains increase by 1 in
+ // each iteration.
+ if i == 0 {
+ // After the first iteration we expect to get the full
+ // congestion window worth of packets in every
+ // iteration.
+ expected += 10
+ }
+ expected++
+ }
+}
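
The stat checks and packet counts in TestFastRecovery above follow NewReno-style fast-recovery bookkeeping: ssthresh is set to half the window, the window is inflated by one segment per duplicate ACK, and a partial ACK deflates it by the amount acked. The sketch below is illustrative only and tallies those numbers for an assumed starting window; it is not part of this change.

package main

import "fmt"

// Illustrative only: tallies the NewReno-style fast-recovery accounting the
// test above asserts. The starting window is an assumed example value.
func main() {
	expected := 64 // assumed cwnd, in packets, when the three dupacks arrive

	ssthresh := expected / 2 // window is halved on entering fast recovery
	cwnd := ssthresh + 3     // inflated by the three dupacks that triggered recovery
	cwnd += 7                // inflated by one more segment per extra dupack

	// The ten inflation segments are what the test receives as "extra"
	// packets; a partial ACK then deflates the window by the amount acked,
	// so the sender stalls until the recovery point is acknowledged.
	fmt.Printf("ssthresh=%d inflated cwnd=%d extra packets=%d\n",
		ssthresh, cwnd, cwnd-ssthresh)
}
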
+
+func TestExponentialIncreaseDuringSlowStart(t *testing.T) {
+ maxPayload := 32
+ c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+ defer c.Cleanup()
+
+ c.CreateConnected(789, 30000, nil)
+
+ const iterations = 7
+ data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+ for i := range data {
+ data[i] = byte(i)
+ }
+
+ // Write all the data in one shot. Packets will only be written at the
+ // MTU size though.
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+
+ expected := tcp.InitialCwnd
+ bytesRead := 0
+ for i := 0; i < iterations; i++ {
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+
+ // Acknowledge all the data received so far.
+ c.SendAck(790, bytesRead)
+
+ // Double the number of expected packets for the next iteration.
+ expected *= 2
+ }
+}
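
The doubling of "expected" above reflects slow start: every acknowledged segment grows the congestion window by one segment, so acknowledging a full window per round trip doubles it. A minimal sketch of that growth rule, with an assumed stand-in for tcp.InitialCwnd (illustrative only, not part of this change):

package main

import "fmt"

// Illustrative only: slow-start growth as the test models it. The initial
// window is an assumed stand-in for tcp.InitialCwnd.
func main() {
	cwnd := 10 // assumed initial window, in packets
	for rtt := 0; rtt < 7; rtt++ {
		fmt.Printf("rtt=%d cwnd=%d\n", rtt, cwnd)
		cwnd *= 2 // a full window of ACKs doubles the window
	}
}
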
+
+func TestCongestionAvoidance(t *testing.T) {
+ maxPayload := 32
+ c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+ defer c.Cleanup()
+
+ c.CreateConnected(789, 30000, nil)
+
+ const iterations = 7
+ data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+ for i := range data {
+ data[i] = byte(i)
+ }
+
+ // Write all the data in one shot. Packets will only be written at the
+ // MTU size though.
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+
+ // Do slow start for a few iterations.
+ expected := tcp.InitialCwnd
+ bytesRead := 0
+ for i := 0; i < iterations; i++ {
+ expected = tcp.InitialCwnd << uint(i)
+ if i > 0 {
+ // Acknowledge all the data received so far if not on
+ // first iteration.
+ c.SendAck(790, bytesRead)
+ }
+
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd (slow start phase).", 50*time.Millisecond)
+ }
+
+ // Don't acknowledge the first packet of the last packet train. Let's
+ // wait for them to time out, which will trigger a restart of slow
+ // start, and initialization of ssthresh to cwnd/2.
+ rtxOffset := bytesRead - maxPayload*expected
+ c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+ // Acknowledge all the data received so far.
+ c.SendAck(790, bytesRead)
+
+ // This part is tricky: when the timeout happened, we had "expected"
+ // packets pending, cwnd reset to 1, and ssthresh set to expected/2.
+ // By acknowledging "expected" packets, the slow-start part will
+ // increase cwnd to expected/2 (which "consumes" expected/2-1 of the
+ // acknowledgements), then the congestion avoidance part will consume
+ // an extra expected/2 acks to take cwnd to expected/2 + 1. One ack
+ // remains in the "ack count" (which will cause cwnd to be incremented
+ // once it reaches cwnd acks).
+ //
+ // So we're straight into congestion avoidance with cwnd set to
+ // expected/2 + 1.
+ //
+ // Check that packets trains of cwnd packets are sent, and that cwnd is
+ // incremented by 1 after we acknowledge each packet.
+ expected = expected/2 + 1
+ for i := 0; i < iterations; i++ {
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd (congestion avoidance phase).", 50*time.Millisecond)
+
+ // Acknowledge all the data received so far.
+ c.SendAck(790, bytesRead)
+
+		// In congestion avoidance, the packet trains increase by 1 in
+ // each iteration.
+ expected++
+ }
+}
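
The "tricky" arithmetic in the comment above (cwnd collapsing to 1, ssthresh becoming expected/2, and a full window of ACKs landing the sender at expected/2 + 1 with one ACK left over) can be walked through with a small Reno-style tally. The sketch below is illustrative only, with an assumed value standing in for "expected"; it is not part of this change.

package main

import "fmt"

// Illustrative only: a Reno-style walk through the comment above. After the
// retransmission timeout the window collapses to 1 and ssthresh becomes
// expected/2; acknowledging "expected" packets then grows the window by one
// per ACK in slow start and by one per full window in congestion avoidance.
func main() {
	expected := 64 // assumed number of packets outstanding at the timeout
	ssthresh := expected / 2
	cwnd := 1
	ackCount := 0

	for acked := 0; acked < expected; acked++ {
		if cwnd < ssthresh {
			cwnd++ // slow start: one segment per ACK
			continue
		}
		// Congestion avoidance: one segment per full window of ACKs.
		ackCount++
		if ackCount >= cwnd {
			ackCount -= cwnd
			cwnd++
		}
	}
	fmt.Printf("cwnd=%d (expected/2+1=%d), leftover acks=%d\n",
		cwnd, expected/2+1, ackCount)
}
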
+
+// cubicCwnd returns an estimate of a cubic window given the
+// originalCwnd, wMax, last congestion event time and sRTT.
+func cubicCwnd(origCwnd int, wMax int, congEventTime time.Time, sRTT time.Duration) int {
+ cwnd := float64(origCwnd)
+ // We wait 50ms between each iteration so sRTT as computed by cubic
+ // should be close to 50ms.
+ elapsed := (time.Since(congEventTime) + sRTT).Seconds()
+ k := math.Cbrt(float64(wMax) * 0.3 / 0.7)
+ wtRTT := 0.4*math.Pow(elapsed-k, 3) + float64(wMax)
+ cwnd += (wtRTT - cwnd) / cwnd
+ return int(cwnd)
+}
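
cubicCwnd approximates a cubic window curve of the form W(t) = C*(t - K)^3 + wMax, with C = 0.4 and K = cbrt(wMax * 0.3 / 0.7) as written in the helper, plus a per-call (wtRTT - cwnd)/cwnd adjustment. The sketch below reimplements the helper so it can run standalone with assumed inputs; it is illustrative only and is not netstack's cubic implementation.

package main

import (
	"fmt"
	"math"
	"time"
)

// cubicEstimate reimplements the cubicCwnd helper above so the sketch is
// self-contained; elapsed stands for time.Since(congEventTime) + sRTT.
func cubicEstimate(origCwnd, wMax int, elapsed time.Duration) int {
	cwnd := float64(origCwnd)
	t := elapsed.Seconds()
	k := math.Cbrt(float64(wMax) * 0.3 / 0.7)
	wtRTT := 0.4*math.Pow(t-k, 3) + float64(wMax)
	cwnd += (wtRTT - cwnd) / cwnd
	return int(cwnd)
}

func main() {
	wMax := 64                       // assumed window before the loss
	cwnd := int(float64(wMax) * 0.7) // post-loss window, matching the 0.7 factor in the test
	for i := 0; i < 5; i++ {
		elapsed := time.Duration((i+1)*50) * time.Millisecond // roughly one 50ms test iteration each
		cwnd = cubicEstimate(cwnd, wMax, elapsed)
		fmt.Printf("iteration=%d estimated cwnd=%d\n", i, cwnd)
	}
}
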
+
+func TestCubicCongestionAvoidance(t *testing.T) {
+ maxPayload := 32
+ c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+ defer c.Cleanup()
+
+ enableCUBIC(t, c)
+
+ c.CreateConnected(789, 30000, nil)
+
+ const iterations = 7
+ data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+
+ for i := range data {
+ data[i] = byte(i)
+ }
+
+ // Write all the data in one shot. Packets will only be written at the
+ // MTU size though.
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+
+ // Do slow start for a few iterations.
+ expected := tcp.InitialCwnd
+ bytesRead := 0
+ for i := 0; i < iterations; i++ {
+ expected = tcp.InitialCwnd << uint(i)
+ if i > 0 {
+ // Acknowledge all the data received so far if not on
+ // first iteration.
+ c.SendAck(790, bytesRead)
+ }
+
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd (during slow-start phase).", 50*time.Millisecond)
+ }
+
+ // Don't acknowledge the first packet of the last packet train. Let's
+ // wait for them to time out, which will trigger a restart of slow
+ // start, and initialization of ssthresh to cwnd * 0.7.
+ rtxOffset := bytesRead - maxPayload*expected
+ c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+ // Acknowledge all pending data.
+ c.SendAck(790, bytesRead)
+
+	// Store away the time we sent the ACK. Assuming a 200ms RTO, we
+	// estimate that the sender will hit an RTO 200ms from now and go back
+	// into slow start.
+ packetDropTime := time.Now().Add(200 * time.Millisecond)
+
+	// This part is tricky: when the timeout happened, we had "expected"
+	// packets pending, cwnd reset to 1, and ssthresh set to expected * 0.7.
+	// By acknowledging "expected" packets, the slow-start part will grow
+	// cwnd up to ssthresh (expected * 0.7), essentially putting the
+	// connection straight into congestion avoidance.
+ wMax := expected
+ // Lower expected as per cubic spec after a congestion event.
+ expected = int(float64(expected) * 0.7)
+ cwnd := expected
+ for i := 0; i < iterations; i++ {
+ // Cubic grows window independent of ACKs. Cubic Window growth
+ // is a function of time elapsed since last congestion event.
+ // As a result the congestion window does not grow
+ // deterministically in response to ACKs.
+ //
+ // We need to roughly estimate what the cwnd of the sender is
+ // based on when we sent the dupacks.
+ cwnd := cubicCwnd(cwnd, wMax, packetDropTime, 50*time.Millisecond)
+
+ packetsExpected := cwnd
+ for j := 0; j < packetsExpected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+ t.Logf("expected packets received, next trying to receive any extra packets that may come")
+
+ // If our estimate was correct there should be no more pending packets.
+ // We attempt to read a packet a few times with a short sleep in between
+ // to ensure that we don't see the sender send any unexpected packets.
+ unexpectedPackets := 0
+ for {
+ gotPacket := c.ReceiveNonBlockingAndCheckPacket(data, bytesRead, maxPayload)
+ if !gotPacket {
+ break
+ }
+ bytesRead += maxPayload
+ unexpectedPackets++
+ time.Sleep(1 * time.Millisecond)
+ }
+ if unexpectedPackets != 0 {
+ t.Fatalf("received %d unexpected packets for iteration %d", unexpectedPackets, i)
+ }
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd(congestion avoidance)", 5*time.Millisecond)
+
+ // Acknowledge all the data received so far.
+ c.SendAck(790, bytesRead)
+ }
+}
+
+func TestRetransmit(t *testing.T) {
+ maxPayload := 32
+ c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+ defer c.Cleanup()
+
+ c.CreateConnected(789, 30000, nil)
+
+ const iterations = 7
+ data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+ for i := range data {
+ data[i] = byte(i)
+ }
+
+ // Write all the data in two shots. Packets will only be written at the
+ // MTU size though.
+ half := data[:len(data)/2]
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+ half = data[len(data)/2:]
+ if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+
+ // Do slow start for a few iterations.
+ expected := tcp.InitialCwnd
+ bytesRead := 0
+ for i := 0; i < iterations; i++ {
+ expected = tcp.InitialCwnd << uint(i)
+ if i > 0 {
+ // Acknowledge all the data received so far if not on
+ // first iteration.
+ c.SendAck(790, bytesRead)
+ }
+
+ // Read all packets expected on this iteration. Don't
+ // acknowledge any of them just yet, so that we can measure the
+ // congestion window.
+ for j := 0; j < expected; j++ {
+ c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+ bytesRead += maxPayload
+ }
+
+ // Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger the sender's
+		// retransmission timeout.
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+ }
+
+ // Wait for a timeout and retransmit.
+ rtxOffset := bytesRead - maxPayload*expected
+ c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+ if got, want := c.Stack().Stats().TCP.Timeouts.Value(), uint64(1); got != want {
+ t.Errorf("got stats.TCP.Timeouts.Value = %v, want = %v", got, want)
+ }
+
+ if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
+ t.Errorf("got stats.TCP.Retransmits.Value = %v, want = %v", got, want)
+ }
+
+ if got, want := c.Stack().Stats().TCP.SlowStartRetransmits.Value(), uint64(1); got != want {
+ t.Errorf("got stats.TCP.SlowStartRetransmits.Value = %v, want = %v", got, want)
+ }
+
+ // Acknowledge half of the pending data.
+ rtxOffset = bytesRead - expected*maxPayload/2
+ c.SendAck(790, rtxOffset)
+
+ // Receive the remaining data, making sure that acknowledged data is not
+ // retransmitted.
+ for offset := rtxOffset; offset < len(data); offset += maxPayload {
+ c.ReceiveAndCheckPacket(data, offset, maxPayload)
+ c.SendAck(790, offset+maxPayload)
+ }
+
+ c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index fe037602b..7d8987219 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -168,8 +168,8 @@ func TestTCPResetsSentIncrement(t *testing.T) {
// Receive the SYN-ACK reply.
b := c.GetPacket()
- tcp := header.TCP(header.IPv4(b).Payload())
- c.IRS = seqnum.Value(tcp.SequenceNumber())
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
ackHeaders := &context.Headers{
SrcPort: context.TestPort,
@@ -269,8 +269,8 @@ func TestConnectResetAfterClose(t *testing.T) {
time.Sleep(3 * time.Second)
for {
b := c.GetPacket()
- tcp := header.TCP(header.IPv4(b).Payload())
- if tcp.Flags() == header.TCPFlagAck|header.TCPFlagFin {
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ if tcpHdr.Flags() == header.TCPFlagAck|header.TCPFlagFin {
// This is a retransmit of the FIN, ignore it.
continue
}
@@ -553,9 +553,13 @@ func TestRstOnCloseWithUnreadData(t *testing.T) {
// We shouldn't consume a sequence number on RST.
checker.SeqNum(uint32(c.IRS)+1),
))
+ // The RST puts the endpoint into an error state.
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
- // This final should be ignored because an ACK on a reset doesn't
- // mean anything.
+ // This final ACK should be ignored because an ACK on a reset doesn't mean
+ // anything.
c.SendPacket(nil, &context.Headers{
SrcPort: context.TestPort,
DstPort: c.Port,
@@ -618,6 +622,10 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
checker.SeqNum(uint32(c.IRS)+1),
))
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateFinWait1; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
// Cause a RST to be generated by closing the read end now since we have
// unread data.
c.EP.Shutdown(tcpip.ShutdownRead)
@@ -630,6 +638,10 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
// We shouldn't consume a sequence number on RST.
checker.SeqNum(uint32(c.IRS)+1),
))
+ // The RST puts the endpoint into an error state.
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
// The ACK to the FIN should now be rejected since the connection has been
// closed by a RST.
@@ -1510,8 +1522,8 @@ func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) {
for bytesReceived != dataLen {
b := c.GetPacket()
numPackets++
- tcp := header.TCP(header.IPv4(b).Payload())
- payloadLen := len(tcp.Payload())
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ payloadLen := len(tcpHdr.Payload())
checker.IPv4(t, b,
checker.TCP(
checker.DstPort(context.TestPort),
@@ -1522,7 +1534,7 @@ func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) {
)
pdata := data[bytesReceived : bytesReceived+payloadLen]
- if p := tcp.Payload(); !bytes.Equal(pdata, p) {
+ if p := tcpHdr.Payload(); !bytes.Equal(pdata, p) {
t.Fatalf("got data = %v, want = %v", p, pdata)
}
bytesReceived += payloadLen
@@ -1530,7 +1542,7 @@ func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) {
if c.TimeStampEnabled {
// If timestamp option is enabled, echo back the timestamp and increment
// the TSEcr value included in the packet and send that back as the TSVal.
- parsedOpts := tcp.ParsedOptions()
+ parsedOpts := tcpHdr.ParsedOptions()
tsOpt := [12]byte{header.TCPOptionNOP, header.TCPOptionNOP}
header.EncodeTSOption(parsedOpts.TSEcr+1, parsedOpts.TSVal, tsOpt[2:])
options = tsOpt[:]
@@ -1757,8 +1769,8 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
),
)
- tcp := header.TCP(header.IPv4(b).Payload())
- c.IRS = seqnum.Value(tcp.SequenceNumber())
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
// Wait for retransmit.
time.Sleep(1 * time.Second)
@@ -1766,8 +1778,8 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
checker.TCP(
checker.DstPort(context.TestPort),
checker.TCPFlags(header.TCPFlagSyn),
- checker.SrcPort(tcp.SourcePort()),
- checker.SeqNum(tcp.SequenceNumber()),
+ checker.SrcPort(tcpHdr.SourcePort()),
+ checker.SeqNum(tcpHdr.SequenceNumber()),
checker.TCPSynOptions(header.TCPSynOptions{MSS: mss, WS: wndScale}),
),
)
@@ -1775,8 +1787,8 @@ func TestSynOptionsOnActiveConnect(t *testing.T) {
// Send SYN-ACK.
iss := seqnum.Value(789)
c.SendPacket(nil, &context.Headers{
- SrcPort: tcp.DestinationPort(),
- DstPort: tcp.SourcePort(),
+ SrcPort: tcpHdr.DestinationPort(),
+ DstPort: tcpHdr.SourcePort(),
Flags: header.TCPFlagSyn | header.TCPFlagAck,
SeqNum: iss,
AckNum: c.IRS.Add(1),
@@ -2336,491 +2348,6 @@ func TestFinWithPartialAck(t *testing.T) {
})
}
-func TestExponentialIncreaseDuringSlowStart(t *testing.T) {
- maxPayload := 32
- c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
- defer c.Cleanup()
-
- c.CreateConnected(789, 30000, nil)
-
- const iterations = 7
- data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
- for i := range data {
- data[i] = byte(i)
- }
-
- // Write all the data in one shot. Packets will only be written at the
- // MTU size though.
- if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
- t.Fatalf("Write failed: %v", err)
- }
-
- expected := tcp.InitialCwnd
- bytesRead := 0
- for i := 0; i < iterations; i++ {
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
-
- // Acknowledge all the data received so far.
- c.SendAck(790, bytesRead)
-
- // Double the number of expected packets for the next iteration.
- expected *= 2
- }
-}
-
-func TestCongestionAvoidance(t *testing.T) {
- maxPayload := 32
- c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
- defer c.Cleanup()
-
- c.CreateConnected(789, 30000, nil)
-
- const iterations = 7
- data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
- for i := range data {
- data[i] = byte(i)
- }
-
- // Write all the data in one shot. Packets will only be written at the
- // MTU size though.
- if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
- t.Fatalf("Write failed: %v", err)
- }
-
- // Do slow start for a few iterations.
- expected := tcp.InitialCwnd
- bytesRead := 0
- for i := 0; i < iterations; i++ {
- expected = tcp.InitialCwnd << uint(i)
- if i > 0 {
- // Acknowledge all the data received so far if not on
- // first iteration.
- c.SendAck(790, bytesRead)
- }
-
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd (slow start phase).", 50*time.Millisecond)
- }
-
- // Don't acknowledge the first packet of the last packet train. Let's
- // wait for them to time out, which will trigger a restart of slow
- // start, and initialization of ssthresh to cwnd/2.
- rtxOffset := bytesRead - maxPayload*expected
- c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
-
- // Acknowledge all the data received so far.
- c.SendAck(790, bytesRead)
-
- // This part is tricky: when the timeout happened, we had "expected"
- // packets pending, cwnd reset to 1, and ssthresh set to expected/2.
- // By acknowledging "expected" packets, the slow-start part will
- // increase cwnd to expected/2 (which "consumes" expected/2-1 of the
- // acknowledgements), then the congestion avoidance part will consume
- // an extra expected/2 acks to take cwnd to expected/2 + 1. One ack
- // remains in the "ack count" (which will cause cwnd to be incremented
- // once it reaches cwnd acks).
- //
- // So we're straight into congestion avoidance with cwnd set to
- // expected/2 + 1.
- //
- // Check that packets trains of cwnd packets are sent, and that cwnd is
- // incremented by 1 after we acknowledge each packet.
- expected = expected/2 + 1
- for i := 0; i < iterations; i++ {
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd (congestion avoidance phase).", 50*time.Millisecond)
-
- // Acknowledge all the data received so far.
- c.SendAck(790, bytesRead)
-
- // In cogestion avoidance, the packets trains increase by 1 in
- // each iteration.
- expected++
- }
-}
-
-// cubicCwnd returns an estimate of a cubic window given the
-// originalCwnd, wMax, last congestion event time and sRTT.
-func cubicCwnd(origCwnd int, wMax int, congEventTime time.Time, sRTT time.Duration) int {
- cwnd := float64(origCwnd)
- // We wait 50ms between each iteration so sRTT as computed by cubic
- // should be close to 50ms.
- elapsed := (time.Since(congEventTime) + sRTT).Seconds()
- k := math.Cbrt(float64(wMax) * 0.3 / 0.7)
- wtRTT := 0.4*math.Pow(elapsed-k, 3) + float64(wMax)
- cwnd += (wtRTT - cwnd) / cwnd
- return int(cwnd)
-}
-
-func TestCubicCongestionAvoidance(t *testing.T) {
- maxPayload := 32
- c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
- defer c.Cleanup()
-
- enableCUBIC(t, c)
-
- c.CreateConnected(789, 30000, nil)
-
- const iterations = 7
- data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
-
- for i := range data {
- data[i] = byte(i)
- }
-
- // Write all the data in one shot. Packets will only be written at the
- // MTU size though.
- if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
- t.Fatalf("Write failed: %v", err)
- }
-
- // Do slow start for a few iterations.
- expected := tcp.InitialCwnd
- bytesRead := 0
- for i := 0; i < iterations; i++ {
- expected = tcp.InitialCwnd << uint(i)
- if i > 0 {
- // Acknowledge all the data received so far if not on
- // first iteration.
- c.SendAck(790, bytesRead)
- }
-
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd (during slow-start phase).", 50*time.Millisecond)
- }
-
- // Don't acknowledge the first packet of the last packet train. Let's
- // wait for them to time out, which will trigger a restart of slow
- // start, and initialization of ssthresh to cwnd * 0.7.
- rtxOffset := bytesRead - maxPayload*expected
- c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
-
- // Acknowledge all pending data.
- c.SendAck(790, bytesRead)
-
- // Store away the time we sent the ACK and assuming a 200ms RTO
- // we estimate that the sender will have an RTO 200ms from now
- // and go back into slow start.
- packetDropTime := time.Now().Add(200 * time.Millisecond)
-
- // This part is tricky: when the timeout happened, we had "expected"
- // packets pending, cwnd reset to 1, and ssthresh set to expected * 0.7.
- // By acknowledging "expected" packets, the slow-start part will
- // increase cwnd to expected/2 essentially putting the connection
- // straight into congestion avoidance.
- wMax := expected
- // Lower expected as per cubic spec after a congestion event.
- expected = int(float64(expected) * 0.7)
- cwnd := expected
- for i := 0; i < iterations; i++ {
- // Cubic grows window independent of ACKs. Cubic Window growth
- // is a function of time elapsed since last congestion event.
- // As a result the congestion window does not grow
- // deterministically in response to ACKs.
- //
- // We need to roughly estimate what the cwnd of the sender is
- // based on when we sent the dupacks.
- cwnd := cubicCwnd(cwnd, wMax, packetDropTime, 50*time.Millisecond)
-
- packetsExpected := cwnd
- for j := 0; j < packetsExpected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
- t.Logf("expected packets received, next trying to receive any extra packets that may come")
-
- // If our estimate was correct there should be no more pending packets.
- // We attempt to read a packet a few times with a short sleep in between
- // to ensure that we don't see the sender send any unexpected packets.
- unexpectedPackets := 0
- for {
- gotPacket := c.ReceiveNonBlockingAndCheckPacket(data, bytesRead, maxPayload)
- if !gotPacket {
- break
- }
- bytesRead += maxPayload
- unexpectedPackets++
- time.Sleep(1 * time.Millisecond)
- }
- if unexpectedPackets != 0 {
- t.Fatalf("received %d unexpected packets for iteration %d", unexpectedPackets, i)
- }
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd(congestion avoidance)", 5*time.Millisecond)
-
- // Acknowledge all the data received so far.
- c.SendAck(790, bytesRead)
- }
-}
-
-func TestFastRecovery(t *testing.T) {
- maxPayload := 32
- c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
- defer c.Cleanup()
-
- c.CreateConnected(789, 30000, nil)
-
- const iterations = 7
- data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
- for i := range data {
- data[i] = byte(i)
- }
-
- // Write all the data in one shot. Packets will only be written at the
- // MTU size though.
- if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
- t.Fatalf("Write failed: %v", err)
- }
-
- // Do slow start for a few iterations.
- expected := tcp.InitialCwnd
- bytesRead := 0
- for i := 0; i < iterations; i++ {
- expected = tcp.InitialCwnd << uint(i)
- if i > 0 {
- // Acknowledge all the data received so far if not on
- // first iteration.
- c.SendAck(790, bytesRead)
- }
-
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
- }
-
- // Send 3 duplicate acks. This should force an immediate retransmit of
- // the pending packet and put the sender into fast recovery.
- rtxOffset := bytesRead - maxPayload*expected
- for i := 0; i < 3; i++ {
- c.SendAck(790, rtxOffset)
- }
-
- // Receive the retransmitted packet.
- c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
-
- if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want {
- t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
- }
-
- if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
- t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
- }
-
- if got, want := c.Stack().Stats().TCP.FastRecovery.Value(), uint64(1); got != want {
- t.Errorf("got stats.TCP.FastRecovery.Value = %v, want = %v", got, want)
- }
-
- // Now send 7 mode duplicate acks. Each of these should cause a window
- // inflation by 1 and cause the sender to send an extra packet.
- for i := 0; i < 7; i++ {
- c.SendAck(790, rtxOffset)
- }
-
- recover := bytesRead
-
- // Ensure no new packets arrive.
- c.CheckNoPacketTimeout("More packets received than expected during recovery after dupacks for this cwnd.",
- 50*time.Millisecond)
-
- // Acknowledge half of the pending data.
- rtxOffset = bytesRead - expected*maxPayload/2
- c.SendAck(790, rtxOffset)
-
- // Receive the retransmit due to partial ack.
- c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
-
- if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(2); got != want {
- t.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want)
- }
-
- if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(2); got != want {
- t.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want)
- }
-
- // Receive the 10 extra packets that should have been released due to
- // the congestion window inflation in recovery.
- for i := 0; i < 10; i++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // A partial ACK during recovery should reduce congestion window by the
- // number acked. Since we had "expected" packets outstanding before sending
- // partial ack and we acked expected/2 , the cwnd and outstanding should
- // be expected/2 + 10 (7 dupAcks + 3 for the original 3 dupacks that triggered
- // fast recovery). Which means the sender should not send any more packets
- // till we ack this one.
- c.CheckNoPacketTimeout("More packets received than expected during recovery after partial ack for this cwnd.",
- 50*time.Millisecond)
-
- // Acknowledge all pending data to recover point.
- c.SendAck(790, recover)
-
- // At this point, the cwnd should reset to expected/2 and there are 10
- // packets outstanding.
- //
- // NOTE: Technically netstack is incorrect in that we adjust the cwnd on
- // the same segment that takes us out of recovery. But because of that
- // the actual cwnd at exit of recovery will be expected/2 + 1 as we
- // acked a cwnd worth of packets which will increase the cwnd further by
- // 1 in congestion avoidance.
- //
- // Now in the first iteration since there are 10 packets outstanding.
- // We would expect to get expected/2 +1 - 10 packets. But subsequent
- // iterations will send us expected/2 + 1 + 1 (per iteration).
- expected = expected/2 + 1 - 10
- for i := 0; i < iterations; i++ {
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout(fmt.Sprintf("More packets received(after deflation) than expected %d for this cwnd.", expected), 50*time.Millisecond)
-
- // Acknowledge all the data received so far.
- c.SendAck(790, bytesRead)
-
- // In cogestion avoidance, the packets trains increase by 1 in
- // each iteration.
- if i == 0 {
- // After the first iteration we expect to get the full
- // congestion window worth of packets in every
- // iteration.
- expected += 10
- }
- expected++
- }
-}
-
-func TestRetransmit(t *testing.T) {
- maxPayload := 32
- c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
- defer c.Cleanup()
-
- c.CreateConnected(789, 30000, nil)
-
- const iterations = 7
- data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
- for i := range data {
- data[i] = byte(i)
- }
-
- // Write all the data in two shots. Packets will only be written at the
- // MTU size though.
- half := data[:len(data)/2]
- if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil {
- t.Fatalf("Write failed: %v", err)
- }
- half = data[len(data)/2:]
- if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil {
- t.Fatalf("Write failed: %v", err)
- }
-
- // Do slow start for a few iterations.
- expected := tcp.InitialCwnd
- bytesRead := 0
- for i := 0; i < iterations; i++ {
- expected = tcp.InitialCwnd << uint(i)
- if i > 0 {
- // Acknowledge all the data received so far if not on
- // first iteration.
- c.SendAck(790, bytesRead)
- }
-
- // Read all packets expected on this iteration. Don't
- // acknowledge any of them just yet, so that we can measure the
- // congestion window.
- for j := 0; j < expected; j++ {
- c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
- bytesRead += maxPayload
- }
-
- // Check we don't receive any more packets on this iteration.
- // The timeout can't be too high or we'll trigger a timeout.
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
- }
-
- // Wait for a timeout and retransmit.
- rtxOffset := bytesRead - maxPayload*expected
- c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
-
- if got, want := c.Stack().Stats().TCP.Timeouts.Value(), uint64(1); got != want {
- t.Errorf("got stats.TCP.Timeouts.Value = %v, want = %v", got, want)
- }
-
- if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
- t.Errorf("got stats.TCP.Retransmits.Value = %v, want = %v", got, want)
- }
-
- if got, want := c.Stack().Stats().TCP.SlowStartRetransmits.Value(), uint64(1); got != want {
- t.Errorf("got stats.TCP.SlowStartRetransmits.Value = %v, want = %v", got, want)
- }
-
- // Acknowledge half of the pending data.
- rtxOffset = bytesRead - expected*maxPayload/2
- c.SendAck(790, rtxOffset)
-
- // Receive the remaining data, making sure that acknowledged data is not
- // retransmitted.
- for offset := rtxOffset; offset < len(data); offset += maxPayload {
- c.ReceiveAndCheckPacket(data, offset, maxPayload)
- c.SendAck(790, offset+maxPayload)
- }
-
- c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
-}
-
func TestUpdateListenBacklog(t *testing.T) {
c := context.New(t, defaultMTU)
defer c.Cleanup()
@@ -3008,8 +2535,8 @@ func TestReceivedSegmentQueuing(t *testing.T) {
checker.TCPFlags(header.TCPFlagAck),
),
)
- tcp := header.TCP(header.IPv4(b).Payload())
- ack := seqnum.Value(tcp.AckNumber())
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ ack := seqnum.Value(tcpHdr.AckNumber())
if ack == last {
break
}
@@ -3053,6 +2580,10 @@ func TestReadAfterClosedState(t *testing.T) {
),
)
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateFinWait1; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
// Send some data and acknowledge the FIN.
data := []byte{1, 2, 3}
c.SendPacket(data, &context.Headers{
@@ -3074,9 +2605,15 @@ func TestReadAfterClosedState(t *testing.T) {
),
)
- // Give the stack the chance to transition to closed state.
+ // Give the stack the chance to transition to closed state. Note that since
+ // both the sender and receiver are now closed, we effectively skip the
+ // TIME-WAIT state.
time.Sleep(1 * time.Second)
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateClose; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
// Wait for receive to be notified.
select {
case <-ch:
@@ -3668,13 +3205,14 @@ func TestTCPEndpointProbe(t *testing.T) {
}
}
-func TestSetCongestionControl(t *testing.T) {
+func TestStackSetCongestionControl(t *testing.T) {
testCases := []struct {
- cc tcp.CongestionControlOption
- mustPass bool
+ cc tcpip.CongestionControlOption
+ err *tcpip.Error
}{
- {"reno", true},
- {"cubic", true},
+ {"reno", nil},
+ {"cubic", nil},
+ {"blahblah", tcpip.ErrNoSuchFile},
}
for _, tc := range testCases {
@@ -3684,62 +3222,135 @@ func TestSetCongestionControl(t *testing.T) {
s := c.Stack()
- if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tc.cc); err != nil && tc.mustPass {
- t.Fatalf("s.SetTransportProtocolOption(%v, %v) = %v, want not-nil", tcp.ProtocolNumber, tc.cc, err)
+ var oldCC tcpip.CongestionControlOption
+ if err := s.TransportProtocolOption(tcp.ProtocolNumber, &oldCC); err != nil {
+ t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &oldCC, err)
}
- var cc tcp.CongestionControlOption
+ if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tc.cc); err != tc.err {
+ t.Fatalf("s.SetTransportProtocolOption(%v, %v) = %v, want %v", tcp.ProtocolNumber, tc.cc, err, tc.err)
+ }
+
+ var cc tcpip.CongestionControlOption
if err := s.TransportProtocolOption(tcp.ProtocolNumber, &cc); err != nil {
t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &cc, err)
}
- if got, want := cc, tc.cc; got != want {
+
+ got, want := cc, oldCC
+ // If SetTransportProtocolOption is expected to succeed
+ // then the returned value for congestion control should
+ // match the one specified in the
+ // SetTransportProtocolOption call above, else it should
+ // be what it was before the call to
+ // SetTransportProtocolOption.
+ if tc.err == nil {
+ want = tc.cc
+ }
+ if got != want {
t.Fatalf("got congestion control: %v, want: %v", got, want)
}
})
}
}
-func TestAvailableCongestionControl(t *testing.T) {
+func TestStackAvailableCongestionControl(t *testing.T) {
c := context.New(t, 1500)
defer c.Cleanup()
s := c.Stack()
// Query permitted congestion control algorithms.
- var aCC tcp.AvailableCongestionControlOption
+ var aCC tcpip.AvailableCongestionControlOption
if err := s.TransportProtocolOption(tcp.ProtocolNumber, &aCC); err != nil {
t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &aCC, err)
}
- if got, want := aCC, tcp.AvailableCongestionControlOption("reno cubic"); got != want {
- t.Fatalf("got tcp.AvailableCongestionControlOption: %v, want: %v", got, want)
+ if got, want := aCC, tcpip.AvailableCongestionControlOption("reno cubic"); got != want {
+ t.Fatalf("got tcpip.AvailableCongestionControlOption: %v, want: %v", got, want)
}
}
-func TestSetAvailableCongestionControl(t *testing.T) {
+func TestStackSetAvailableCongestionControl(t *testing.T) {
c := context.New(t, 1500)
defer c.Cleanup()
s := c.Stack()
// Setting AvailableCongestionControlOption should fail.
- aCC := tcp.AvailableCongestionControlOption("xyz")
+ aCC := tcpip.AvailableCongestionControlOption("xyz")
if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &aCC); err == nil {
t.Fatalf("s.TransportProtocolOption(%v, %v) = nil, want non-nil", tcp.ProtocolNumber, &aCC)
}
// Verify that we still get the expected list of congestion control options.
- var cc tcp.AvailableCongestionControlOption
+ var cc tcpip.AvailableCongestionControlOption
if err := s.TransportProtocolOption(tcp.ProtocolNumber, &cc); err != nil {
t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &cc, err)
}
- if got, want := cc, tcp.AvailableCongestionControlOption("reno cubic"); got != want {
- t.Fatalf("got tcp.AvailableCongestionControlOption: %v, want: %v", got, want)
+ if got, want := cc, tcpip.AvailableCongestionControlOption("reno cubic"); got != want {
+ t.Fatalf("got tcpip.AvailableCongestionControlOption: %v, want: %v", got, want)
+ }
+}
+
+func TestEndpointSetCongestionControl(t *testing.T) {
+ testCases := []struct {
+ cc tcpip.CongestionControlOption
+ err *tcpip.Error
+ }{
+ {"reno", nil},
+ {"cubic", nil},
+ {"blahblah", tcpip.ErrNoSuchFile},
+ }
+
+ for _, connected := range []bool{false, true} {
+ for _, tc := range testCases {
+ t.Run(fmt.Sprintf("SetSockOpt(.., %v) w/ connected = %v", tc.cc, connected), func(t *testing.T) {
+ c := context.New(t, 1500)
+ defer c.Cleanup()
+
+ // Create TCP endpoint.
+ var err *tcpip.Error
+ c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+ if err != nil {
+ t.Fatalf("NewEndpoint failed: %v", err)
+ }
+
+ var oldCC tcpip.CongestionControlOption
+ if err := c.EP.GetSockOpt(&oldCC); err != nil {
+ t.Fatalf("c.EP.SockOpt(%v) = %v", &oldCC, err)
+ }
+
+ if connected {
+ c.Connect(789 /* iss */, 32768 /* rcvWnd */, nil)
+ }
+
+ if err := c.EP.SetSockOpt(tc.cc); err != tc.err {
+ t.Fatalf("c.EP.SetSockOpt(%v) = %v, want %v", tc.cc, err, tc.err)
+ }
+
+ var cc tcpip.CongestionControlOption
+ if err := c.EP.GetSockOpt(&cc); err != nil {
+ t.Fatalf("c.EP.SockOpt(%v) = %v", &cc, err)
+ }
+
+ got, want := cc, oldCC
+ // If SetSockOpt is expected to succeed then the
+ // returned value for congestion control should match
+ // the one specified in the SetSockOpt above, else it
+ // should be what it was before the call to SetSockOpt.
+ if tc.err == nil {
+ want = tc.cc
+ }
+ if got != want {
+ t.Fatalf("got congestion control: %v, want: %v", got, want)
+ }
+ })
+ }
}
}
func enableCUBIC(t *testing.T, c *context.Context) {
t.Helper()
- opt := tcp.CongestionControlOption("cubic")
+ opt := tcpip.CongestionControlOption("cubic")
if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, opt); err != nil {
t.Fatalf("c.s.SetTransportProtocolOption(tcp.ProtocolNumber, %v = %v", opt, err)
}
@@ -3868,7 +3479,7 @@ func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCooki
RcvWnd: 30000,
})
- // Receive the SYN-ACK reply.
+	// Receive the SYN-ACK reply.
b := c.GetPacket()
tcp := header.TCP(header.IPv4(b).Payload())
iss = seqnum.Value(tcp.SequenceNumber())
@@ -3932,12 +3543,18 @@ func TestListenBacklogFull(t *testing.T) {
time.Sleep(50 * time.Millisecond)
- // Now execute one more handshake. This should not be completed and
- // delivered on an Accept() call as the backlog is full at this point.
- irs, iss := executeHandshake(t, c, context.TestPort+uint16(listenBacklog), false /* synCookieInUse */)
+	// Now send one more SYN. The stack should not respond as the backlog
+ // is full at this point.
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort + 2,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagSyn,
+ SeqNum: seqnum.Value(789),
+ RcvWnd: 30000,
+ })
+ c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
- time.Sleep(50 * time.Millisecond)
- // Try to accept the connection.
+ // Try to accept the connections in the backlog.
we, ch := waiter.NewChannelEntry(nil)
c.WQ.EventRegister(&we, waiter.EventIn)
defer c.WQ.EventUnregister(&we)
@@ -3969,16 +3586,8 @@ func TestListenBacklogFull(t *testing.T) {
}
}
- // Now craft the ACK again and verify that the connection is now ready
- // to be accepted.
- c.SendPacket(nil, &context.Headers{
- SrcPort: context.TestPort + uint16(listenBacklog),
- DstPort: context.StackPort,
- Flags: header.TCPFlagAck,
- SeqNum: irs + 1,
- AckNum: iss + 1,
- RcvWnd: 30000,
- })
+ // Now a new handshake must succeed.
+ executeHandshake(t, c, context.TestPort+2, false /*synCookieInUse */)
newEP, _, err := c.EP.Accept()
if err == tcpip.ErrWouldBlock {
@@ -3994,6 +3603,7 @@ func TestListenBacklogFull(t *testing.T) {
t.Fatalf("Timed out waiting for accept")
}
}
+
// Now verify that the TCP socket is usable and in a connected state.
data := "Don't panic"
newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{})
@@ -4004,13 +3614,7 @@ func TestListenBacklogFull(t *testing.T) {
}
}
-func TestListenBacklogFullSynCookieInUse(t *testing.T) {
- saved := tcp.SynRcvdCountThreshold
- defer func() {
- tcp.SynRcvdCountThreshold = saved
- }()
- tcp.SynRcvdCountThreshold = 1
-
+func TestListenSynRcvdQueueFull(t *testing.T) {
c := context.New(t, defaultMTU)
defer c.Cleanup()
@@ -4029,48 +3633,72 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
// Test acceptance.
// Start listening.
listenBacklog := 1
- portOffset := uint16(0)
if err := c.EP.Listen(listenBacklog); err != nil {
t.Fatalf("Listen failed: %v", err)
}
- executeHandshake(t, c, context.TestPort+portOffset, false)
- portOffset++
- // Wait for this to be delivered to the accept queue.
- time.Sleep(50 * time.Millisecond)
+ // Send two SYN's the first one should get a SYN-ACK, the
+ // second one should not get any response and is dropped as
+ // the synRcvd count will be equal to backlog.
+ irs := seqnum.Value(789)
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagSyn,
+ SeqNum: seqnum.Value(789),
+ RcvWnd: 30000,
+ })
- nonCookieIRS, nonCookieISS := executeHandshake(t, c, context.TestPort+portOffset, false)
+ // Receive the SYN-ACK reply.
+ b := c.GetPacket()
+ tcp := header.TCP(header.IPv4(b).Payload())
+ iss := seqnum.Value(tcp.SequenceNumber())
+ tcpCheckers := []checker.TransportChecker{
+ checker.SrcPort(context.StackPort),
+ checker.DstPort(context.TestPort),
+ checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
+ checker.AckNum(uint32(irs) + 1),
+ }
+ checker.IPv4(t, b, checker.TCP(tcpCheckers...))
- // Since the backlog is full at this point this connection will not
- // transition out of handshake and ignore the ACK.
- //
- // At this point there should be 1 completed connection in the backlog
- // and one incomplete one pending for a final ACK and hence not ready to be
- // delivered to the endpoint.
- //
- // Now execute one more handshake. This should not be completed and
- // delivered on an Accept() call as the backlog is full at this point
- // and there is already 1 pending endpoint.
+	// Now send one more SYN. The stack should not respond as the backlog
+ // is full at this point.
//
- // This one should use a SYN cookie as the synRcvdCount is equal to the
- // SynRcvdCountThreshold.
- time.Sleep(50 * time.Millisecond)
- portOffset++
- irs, iss := executeHandshake(t, c, context.TestPort+portOffset, true)
+ // NOTE: we did not complete the handshake for the previous one so the
+ // accept backlog should be empty and there should be one connection in
+ // synRcvd state.
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort + 1,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagSyn,
+ SeqNum: seqnum.Value(889),
+ RcvWnd: 30000,
+ })
+ c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
- time.Sleep(50 * time.Millisecond)
+ // Now complete the previous connection and verify that there is a connection
+ // to accept.
+ // Send ACK.
+ c.SendPacket(nil, &context.Headers{
+ SrcPort: context.TestPort,
+ DstPort: context.StackPort,
+ Flags: header.TCPFlagAck,
+ SeqNum: irs + 1,
+ AckNum: iss + 1,
+ RcvWnd: 30000,
+ })
- // Verify that there is only one acceptable connection at this point.
+ // Try to accept the connections in the backlog.
we, ch := waiter.NewChannelEntry(nil)
c.WQ.EventRegister(&we, waiter.EventIn)
defer c.WQ.EventUnregister(&we)
- _, _, err = c.EP.Accept()
+ newEP, _, err := c.EP.Accept()
if err == tcpip.ErrWouldBlock {
// Wait for connection to be established.
select {
case <-ch:
- _, _, err = c.EP.Accept()
+ newEP, _, err = c.EP.Accept()
if err != nil {
t.Fatalf("Accept failed: %v", err)
}
@@ -4080,27 +3708,68 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
}
}
- // Now verify that there are no more connections that can be accepted.
- _, _, err = c.EP.Accept()
- if err != tcpip.ErrWouldBlock {
- select {
- case <-ch:
- t.Fatalf("unexpected endpoint delivered on Accept: %+v", c.EP)
- case <-time.After(1 * time.Second):
- }
+ // Now verify that the TCP socket is usable and in a connected state.
+ data := "Don't panic"
+ newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{})
+ pkt := c.GetPacket()
+ tcp = header.TCP(header.IPv4(pkt).Payload())
+ if string(tcp.Payload()) != data {
+ t.Fatalf("Unexpected data: got %v, want %v", string(tcp.Payload()), data)
}
+}
- // Now send an ACK for the half completed connection
+func TestListenBacklogFullSynCookieInUse(t *testing.T) {
+ saved := tcp.SynRcvdCountThreshold
+ defer func() {
+ tcp.SynRcvdCountThreshold = saved
+ }()
+ tcp.SynRcvdCountThreshold = 1
+
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+
+ // Create TCP endpoint.
+ var err *tcpip.Error
+ c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+ if err != nil {
+ t.Fatalf("NewEndpoint failed: %v", err)
+ }
+
+ // Bind to wildcard.
+ if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+ t.Fatalf("Bind failed: %v", err)
+ }
+
+ // Test acceptance.
+ // Start listening.
+ listenBacklog := 1
+ portOffset := uint16(0)
+ if err := c.EP.Listen(listenBacklog); err != nil {
+ t.Fatalf("Listen failed: %v", err)
+ }
+
+ executeHandshake(t, c, context.TestPort+portOffset, false)
+ portOffset++
+ // Wait for this to be delivered to the accept queue.
+ time.Sleep(50 * time.Millisecond)
+
+ // Send a SYN request.
+ irs := seqnum.Value(789)
c.SendPacket(nil, &context.Headers{
- SrcPort: context.TestPort + portOffset - 1,
+ SrcPort: context.TestPort,
DstPort: context.StackPort,
- Flags: header.TCPFlagAck,
- SeqNum: nonCookieIRS + 1,
- AckNum: nonCookieISS + 1,
+ Flags: header.TCPFlagSyn,
+ SeqNum: irs,
RcvWnd: 30000,
})
+ // The Syn should be dropped as the endpoint's backlog is full.
+ c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
+
+ // Verify that there is only one acceptable connection at this point.
+ we, ch := waiter.NewChannelEntry(nil)
+ c.WQ.EventRegister(&we, waiter.EventIn)
+ defer c.WQ.EventUnregister(&we)
- // Verify that the connection is now delivered to the backlog.
_, _, err = c.EP.Accept()
if err == tcpip.ErrWouldBlock {
// Wait for connection to be established.
@@ -4116,41 +3785,15 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) {
}
}
- // Finally send an ACK for the connection that used a cookie and verify that
- // it's also completed and delivered.
- c.SendPacket(nil, &context.Headers{
- SrcPort: context.TestPort + portOffset,
- DstPort: context.StackPort,
- Flags: header.TCPFlagAck,
- SeqNum: irs,
- AckNum: iss,
- RcvWnd: 30000,
- })
-
- time.Sleep(50 * time.Millisecond)
- newEP, _, err := c.EP.Accept()
- if err == tcpip.ErrWouldBlock {
- // Wait for connection to be established.
+ // Now verify that there are no more connections that can be accepted.
+ _, _, err = c.EP.Accept()
+ if err != tcpip.ErrWouldBlock {
select {
case <-ch:
- newEP, _, err = c.EP.Accept()
- if err != nil {
- t.Fatalf("Accept failed: %v", err)
- }
-
+ t.Fatalf("unexpected endpoint delivered on Accept: %+v", c.EP)
case <-time.After(1 * time.Second):
- t.Fatalf("Timed out waiting for accept")
}
}
-
- // Now verify that the TCP socket is usable and in a connected state.
- data := "Don't panic"
- newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{})
- b := c.GetPacket()
- tcp := header.TCP(header.IPv4(b).Payload())
- if string(tcp.Payload()) != data {
- t.Fatalf("Unexpected data: got %v, want %v", string(tcp.Payload()), data)
- }
}
func TestPassiveConnectionAttemptIncrement(t *testing.T) {
@@ -4165,9 +3808,15 @@ func TestPassiveConnectionAttemptIncrement(t *testing.T) {
if err := ep.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil {
t.Fatalf("Bind failed: %v", err)
}
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateBound; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
if err := c.EP.Listen(1); err != nil {
t.Fatalf("Listen failed: %v", err)
}
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateListen; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
stats := c.Stack().Stats()
want := stats.TCP.PassiveConnectionOpenings.Value() + 1
@@ -4218,18 +3867,12 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) {
}
srcPort := uint16(context.TestPort)
- // Now attempt 3 handshakes, the first two will fill up the accept and the SYN-RCVD
- // queue for the endpoint.
+	// Now attempt a handshake; it will fill up the accept backlog.
executeHandshake(t, c, srcPort, false)
// Give time for the final ACK to be processed as otherwise the next handshake could
// get accepted before the previous one based on goroutine scheduling.
time.Sleep(50 * time.Millisecond)
- irs, iss := executeHandshake(t, c, srcPort+1, false)
-
- // Wait for a short while for the accepted connection to be delivered to
- // the channel before trying to send the 3rd SYN.
- time.Sleep(40 * time.Millisecond)
want := stats.TCP.ListenOverflowSynDrop.Value() + 1
@@ -4267,26 +3910,44 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) {
t.Fatalf("Timed out waiting for accept")
}
}
+}
- // Now complete the next connection in SYN-RCVD state as it should
- // have dropped the final ACK to the handshake due to accept queue
- // being full.
- c.SendPacket(nil, &context.Headers{
- SrcPort: srcPort + 1,
- DstPort: context.StackPort,
- Flags: header.TCPFlagAck,
- SeqNum: irs + 1,
- AckNum: iss + 1,
- RcvWnd: 30000,
- })
+func TestEndpointBindListenAcceptState(t *testing.T) {
+ c := context.New(t, defaultMTU)
+ defer c.Cleanup()
+ wq := &waiter.Queue{}
+ ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+ if err != nil {
+ t.Fatalf("NewEndpoint failed: %v", err)
+ }
- // Now check that there is one more acceptable connections.
- _, _, err = c.EP.Accept()
+ if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+ t.Fatalf("Bind failed: %v", err)
+ }
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateBound; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
+ if err := ep.Listen(10); err != nil {
+ t.Fatalf("Listen failed: %v", err)
+ }
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
+ c.PassiveConnectWithOptions(100, 5, header.TCPSynOptions{MSS: defaultIPv4MSS})
+
+ // Try to accept the connection.
+ we, ch := waiter.NewChannelEntry(nil)
+ wq.EventRegister(&we, waiter.EventIn)
+ defer wq.EventUnregister(&we)
+
+ aep, _, err := ep.Accept()
if err == tcpip.ErrWouldBlock {
// Wait for connection to be established.
select {
case <-ch:
- _, _, err = c.EP.Accept()
+ aep, _, err = ep.Accept()
if err != nil {
t.Fatalf("Accept failed: %v", err)
}
@@ -4295,19 +3956,23 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) {
t.Fatalf("Timed out waiting for accept")
}
}
+ if got, want := tcp.EndpointState(aep.State()), tcp.StateEstablished; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+ // Listening endpoint remains in listen state.
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
- // Try and accept a 3rd one this should fail.
- _, _, err = c.EP.Accept()
- if err == tcpip.ErrWouldBlock {
- // Wait for connection to be established.
- select {
- case <-ch:
- ep, _, err = c.EP.Accept()
- if err == nil {
- t.Fatalf("Accept succeeded when it should have failed got: %+v", ep)
- }
-
- case <-time.After(1 * time.Second):
- }
+ ep.Close()
+ // Give worker goroutines time to receive the close notification.
+ time.Sleep(1 * time.Second)
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateClose; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+ // Accepted endpoint remains open when the listen endpoint is closed.
+ if got, want := tcp.EndpointState(aep.State()), tcp.StateEstablished; got != want {
+ t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
}
+
}
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index 6e12413c6..a4d89e24d 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -520,32 +520,21 @@ func (c *Context) CreateConnected(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf
c.CreateConnectedWithRawOptions(iss, rcvWnd, epRcvBuf, nil)
}
-// CreateConnectedWithRawOptions creates a connected TCP endpoint and sends
-// the specified option bytes as the Option field in the initial SYN packet.
+// Connect performs the 3-way handshake for c.EP with the provided Initial
+// Sequence Number (iss) and receive window (rcvWnd), and any options, if
+// specified.
//
// It also sets the receive buffer for the endpoint to the specified
// value in epRcvBuf.
-func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf *tcpip.ReceiveBufferSizeOption, options []byte) {
- // Create TCP endpoint.
- var err *tcpip.Error
- c.EP, err = c.s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
- if err != nil {
- c.t.Fatalf("NewEndpoint failed: %v", err)
- }
-
- if epRcvBuf != nil {
- if err := c.EP.SetSockOpt(*epRcvBuf); err != nil {
- c.t.Fatalf("SetSockOpt failed failed: %v", err)
- }
- }
-
+//
+// PreCondition: c.EP must already be created.
+func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte) {
// Start connection attempt.
waitEntry, notifyCh := waiter.NewChannelEntry(nil)
c.WQ.EventRegister(&waitEntry, waiter.EventOut)
defer c.WQ.EventUnregister(&waitEntry)
- err = c.EP.Connect(tcpip.FullAddress{Addr: TestAddr, Port: TestPort})
- if err != tcpip.ErrConnectStarted {
+ if err := c.EP.Connect(tcpip.FullAddress{Addr: TestAddr, Port: TestPort}); err != tcpip.ErrConnectStarted {
c.t.Fatalf("Unexpected return value from Connect: %v", err)
}
@@ -557,13 +546,16 @@ func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.
checker.TCPFlags(header.TCPFlagSyn),
),
)
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+ c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
- tcp := header.TCP(header.IPv4(b).Payload())
- c.IRS = seqnum.Value(tcp.SequenceNumber())
+ tcpHdr := header.TCP(header.IPv4(b).Payload())
+ c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
c.SendPacket(nil, &Headers{
- SrcPort: tcp.DestinationPort(),
- DstPort: tcp.SourcePort(),
+ SrcPort: tcpHdr.DestinationPort(),
+ DstPort: tcpHdr.SourcePort(),
Flags: header.TCPFlagSyn | header.TCPFlagAck,
SeqNum: iss,
AckNum: c.IRS.Add(1),
@@ -584,15 +576,38 @@ func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.
// Wait for connection to be established.
select {
case <-notifyCh:
- err = c.EP.GetSockOpt(tcpip.ErrorOption{})
- if err != nil {
+ if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil {
c.t.Fatalf("Unexpected error when connecting: %v", err)
}
case <-time.After(1 * time.Second):
c.t.Fatalf("Timed out waiting for connection")
}
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateEstablished; got != want {
+ c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
+ c.Port = tcpHdr.SourcePort()
+}
+
+// CreateConnectedWithRawOptions creates a connected TCP endpoint and sends
+// the specified option bytes as the Option field in the initial SYN packet.
+//
+// It also sets the receive buffer for the endpoint to the specified
+// value in epRcvBuf.
+func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf *tcpip.ReceiveBufferSizeOption, options []byte) {
+ // Create TCP endpoint.
+ var err *tcpip.Error
+ c.EP, err = c.s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+ if err != nil {
+ c.t.Fatalf("NewEndpoint failed: %v", err)
+ }
- c.Port = tcp.SourcePort()
+ if epRcvBuf != nil {
+ if err := c.EP.SetSockOpt(*epRcvBuf); err != nil {
+ c.t.Fatalf("SetSockOpt failed failed: %v", err)
+ }
+ }
+ c.Connect(iss, rcvWnd, options)
}
// RawEndpoint is just a small wrapper around a TCP endpoint's state to make
@@ -690,6 +705,9 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
if err != nil {
c.t.Fatalf("c.s.NewEndpoint(tcp, ipv4...) = %v", err)
}
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateInitial; got != want {
+ c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
// Start connection attempt.
waitEntry, notifyCh := waiter.NewChannelEntry(nil)
@@ -719,6 +737,10 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
}),
),
)
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+ c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
tcpSeg := header.TCP(header.IPv4(b).Payload())
synOptions := header.ParseSynOptions(tcpSeg.Options(), false)
@@ -782,6 +804,9 @@ func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *
case <-time.After(1 * time.Second):
c.t.Fatalf("Timed out waiting for connection")
}
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateEstablished; got != want {
+ c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
// Store the source port in use by the endpoint.
c.Port = tcpSeg.SourcePort()
@@ -821,10 +846,16 @@ func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOption
if err := ep.Bind(tcpip.FullAddress{Port: StackPort}); err != nil {
c.t.Fatalf("Bind failed: %v", err)
}
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateBound; got != want {
+ c.t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
if err := ep.Listen(10); err != nil {
c.t.Fatalf("Listen failed: %v", err)
}
+ if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want {
+ c.t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
rep := c.PassiveConnectWithOptions(100, wndScale, synOptions)
@@ -847,6 +878,10 @@ func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOption
c.t.Fatalf("Timed out waiting for accept")
}
}
+ if got, want := tcp.EndpointState(c.EP.State()), tcp.StateEstablished; got != want {
+ c.t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+ }
+
return rep
}
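
The hunks above split endpoint creation out of CreateConnectedWithRawOptions: Connect now assumes c.EP already exists and only drives the 3-way handshake, asserting the SynSent and Established transitions along the way. A minimal sketch of calling it from a test in the tcp_test package (reusing that package's existing imports; t is the test's *testing.T, and the iss/rcvWnd values are illustrative rather than taken from this change):

    c := context.New(t, defaultMTU)
    defer c.Cleanup()

    // Connect's precondition is that c.EP already exists, so create it first.
    var err *tcpip.Error
    c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
    if err != nil {
        t.Fatalf("NewEndpoint failed: %v", err)
    }

    // Drive the handshake; on return the endpoint is Established and c.Port
    // holds the source port the endpoint bound to.
    c.Connect(789 /* iss */, 30000 /* rcvWnd */, nil /* options */)
    if got, want := tcp.EndpointState(c.EP.State()), tcp.StateEstablished; got != want {
        t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
    }
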
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 3d52a4f31..fa7278286 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -1000,3 +1000,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv
// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
}
+
+// State implements socket.Socket.State.
+func (e *endpoint) State() uint32 {
+ // TODO(b/112063468): Translate internal state to values returned by Linux.
+ return 0
+}
diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go
index 0f155ec74..4ea684659 100644
--- a/pkg/urpc/urpc.go
+++ b/pkg/urpc/urpc.go
@@ -35,7 +35,7 @@ import (
)
// maxFiles determines the maximum file payload.
-const maxFiles = 16
+const maxFiles = 32
// ErrTooManyFiles is returned when too many file descriptors are mapped.
var ErrTooManyFiles = errors.New("too many files")
diff --git a/runsc/BUILD b/runsc/BUILD
index af8e928c5..8a57c597b 100644
--- a/runsc/BUILD
+++ b/runsc/BUILD
@@ -1,6 +1,4 @@
-package(
- licenses = ["notice"], # Apache 2.0
-)
+package(licenses = ["notice"]) # Apache 2.0
load("@io_bazel_rules_go//go:def.bzl", "go_binary")
load("@bazel_tools//tools/build_defs/pkg:pkg.bzl", "pkg_deb", "pkg_tar")
@@ -84,8 +82,9 @@ pkg_tar(
genrule(
name = "deb-version",
outs = ["version.txt"],
- cmd = "cat bazel-out/volatile-status.txt | grep VERSION | sed 's/^[^0-9]*//' >$@",
+ cmd = "$(location :runsc) -version | grep 'runsc version' | sed 's/^[^0-9]*//' > $@",
stamp = 1,
+ tools = [":runsc"],
)
pkg_deb(
@@ -98,4 +97,7 @@ pkg_deb(
package = "runsc",
postinst = "debian/postinst.sh",
version_file = ":version.txt",
+ visibility = [
+ "//visibility:public",
+ ],
)
diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD
index df9907e52..744f852a1 100644
--- a/runsc/boot/BUILD
+++ b/runsc/boot/BUILD
@@ -16,6 +16,7 @@ go_library(
"limits.go",
"loader.go",
"network.go",
+ "pprof.go",
"strace.go",
],
importpath = "gvisor.googlesource.com/gvisor/runsc/boot",
@@ -30,6 +31,7 @@ go_library(
"//pkg/cpuid",
"//pkg/eventchannel",
"//pkg/log",
+ "//pkg/memutil",
"//pkg/rand",
"//pkg/sentry/arch",
"//pkg/sentry/arch:registers_go_proto",
@@ -51,7 +53,6 @@ go_library(
"//pkg/sentry/kernel/kdefs",
"//pkg/sentry/limits",
"//pkg/sentry/loader",
- "//pkg/sentry/memutil",
"//pkg/sentry/pgalloc",
"//pkg/sentry/platform",
"//pkg/sentry/platform/kvm",
@@ -94,6 +95,7 @@ go_test(
size = "small",
srcs = [
"compat_test.go",
+ "fs_test.go",
"loader_test.go",
],
embed = [":boot"],
diff --git a/runsc/boot/config.go b/runsc/boot/config.go
index 15f624f9b..6112b6c0a 100644
--- a/runsc/boot/config.go
+++ b/runsc/boot/config.go
@@ -221,6 +221,17 @@ type Config struct {
// user, and without chrooting the sandbox process. This can be
// necessary in test environments that have limited capabilities.
TestOnlyAllowRunAsCurrentUserWithoutChroot bool
+
+ // NumNetworkChannels controls the number of AF_PACKET sockets that map
+ // to the same underlying network device. This allows netstack to better
+ // scale for high throughput use cases.
+ NumNetworkChannels int
+
+ // Rootless allows the sandbox to be started with a user that is not root.
+ // Defense in depth measures are weaker with rootless. Specifically, the
+ // sandbox and Gofer process run as root inside a user namespace with root
+ // mapped to the caller's user.
+ Rootless bool
}
// ToFlags returns a slice of flags that correspond to the given Config.
@@ -244,6 +255,8 @@ func (c *Config) ToFlags() []string {
"--panic-signal=" + strconv.Itoa(c.PanicSignal),
"--profile=" + strconv.FormatBool(c.ProfileEnable),
"--net-raw=" + strconv.FormatBool(c.EnableRaw),
+ "--num-network-channels=" + strconv.Itoa(c.NumNetworkChannels),
+ "--rootless=" + strconv.FormatBool(c.Rootless),
}
if c.TestOnlyAllowRunAsCurrentUserWithoutChroot {
// Only include if set since it is never to be used by users.
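
The two new Config fields are wired into ToFlags just above. A minimal, illustrative sketch of how they surface as flags when a caller imports runsc/boot and fills a Config by hand (real configurations are built from runsc's flag parsing; all other fields are left at their zero values here, so this is not a working sandbox configuration):

    package main

    import (
        "fmt"

        "gvisor.googlesource.com/gvisor/runsc/boot"
    )

    func main() {
        conf := &boot.Config{
            NumNetworkChannels: 4,    // four AF_PACKET channels per network device
            Rootless:           true, // allow starting the sandbox as a non-root user
        }
        // The returned slice includes "--num-network-channels=4" and
        // "--rootless=true" alongside the pre-existing flags.
        for _, f := range conf.ToFlags() {
            fmt.Println(f)
        }
    }
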
diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 72ab9ef86..26765cc46 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -237,7 +237,7 @@ func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
return fmt.Errorf("start arguments must contain stdin, stderr, and stdout followed by at least one file for the container root gofer")
}
- err := cm.l.startContainer(cm.l.k, args.Spec, args.Conf, args.CID, args.FilePayload.Files)
+ err := cm.l.startContainer(args.Spec, args.Conf, args.CID, args.FilePayload.Files)
if err != nil {
log.Debugf("containerManager.Start failed %q: %+v: %v", args.CID, args, err)
return err
@@ -340,8 +340,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
cm.l.k = k
// Set up the restore environment.
- fds := &fdDispenser{fds: cm.l.goferFDs}
- renv, err := createRestoreEnvironment(cm.l.spec, cm.l.conf, fds)
+ mntr := newContainerMounter(cm.l.spec, "", cm.l.goferFDs, cm.l.k, cm.l.mountHints)
+ renv, err := mntr.createRestoreEnvironment(cm.l.conf)
if err != nil {
return fmt.Errorf("creating RestoreEnvironment: %v", err)
}
@@ -359,6 +359,17 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("file cannot be empty")
}
+ if cm.l.conf.ProfileEnable {
+ // initializePProf opens /proc/self/maps, so has to be
+ // called before installing seccomp filters.
+ initializePProf()
+ }
+
+ // Seccomp filters have to be applied before parsing the state file.
+ if err := cm.l.installSeccompFilters(); err != nil {
+ return err
+ }
+
// Load the state.
loadOpts := state.LoadOpts{Source: specFile}
if err := loadOpts.Load(k, networkStack); err != nil {
@@ -369,11 +380,11 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
k.Timekeeper().SetClocks(time.NewCalibratedClocks())
// Since we have a new kernel we also must make a new watchdog.
- watchdog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
+ dog := watchdog.New(k, watchdog.DefaultTimeout, cm.l.conf.WatchdogAction)
// Change the loader fields to reflect the changes made when restoring.
cm.l.k = k
- cm.l.watchdog = watchdog
+ cm.l.watchdog = dog
cm.l.rootProcArgs = kernel.CreateProcessArgs{}
cm.l.restore = true
@@ -420,16 +431,12 @@ type WaitPIDArgs struct {
// CID is the container ID.
CID string
-
- // ClearStatus determines whether the exit status of the process should
- // be cleared when WaitPID returns.
- ClearStatus bool
}
// WaitPID waits for the process with PID 'pid' in the sandbox.
func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error {
log.Debugf("containerManager.Wait")
- return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, args.ClearStatus, waitStatus)
+ return cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus)
}
// SignalDeliveryMode enumerates different signal delivery modes.
diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go
index 4e428b49c..0811e10f4 100644
--- a/runsc/boot/fds.go
+++ b/runsc/boot/fds.go
@@ -28,11 +28,12 @@ import (
// createFDMap creates an FD map that contains stdin, stdout, and stderr. If
// console is true, then ioctl calls will be passed through to the host FD.
// Upon success, createFDMap dups then closes stdioFDs.
-func createFDMap(ctx context.Context, k *kernel.Kernel, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) {
+func createFDMap(ctx context.Context, l *limits.LimitSet, console bool, stdioFDs []int) (*kernel.FDMap, error) {
if len(stdioFDs) != 3 {
return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
}
+ k := kernel.KernelFromContext(ctx)
fdm := k.NewFDMap()
defer fdm.DecRef()
mounter := fs.FileOwnerFromContext(ctx)
diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go
index 652da1cef..ef2dbfad2 100644
--- a/runsc/boot/filter/config.go
+++ b/runsc/boot/filter/config.go
@@ -246,6 +246,10 @@ var allowedSyscalls = seccomp.SyscallRules{
},
syscall.SYS_SETITIMER: {},
syscall.SYS_SHUTDOWN: []seccomp.Rule{
+ // Used by fs/host to shutdown host sockets.
+ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RD)},
+ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_WR)},
+ // Used by unet to shutdown connections.
{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)},
},
syscall.SYS_SIGALTSTACK: {},
diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go
index 4b1557b9a..2fa0725d1 100644
--- a/runsc/boot/fs.go
+++ b/runsc/boot/fs.go
@@ -18,6 +18,7 @@ import (
"fmt"
"path"
"path/filepath"
+ "sort"
"strconv"
"strings"
"syscall"
@@ -29,9 +30,6 @@ import (
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
- "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
- "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
@@ -40,6 +38,8 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/syserror"
"gvisor.googlesource.com/gvisor/runsc/specutils"
)
@@ -51,6 +51,9 @@ const (
// Device name for root mount.
rootDevice = "9pfs-/"
+ // MountPrefix is the annotation prefix for mount hints.
+ MountPrefix = "gvisor.dev/spec/mount"
+
// ChildContainersDir is the directory where child container root
// filesystems are mounted.
ChildContainersDir = "/__runsc_containers__"
@@ -65,67 +68,24 @@ const (
nonefs = "none"
)
-type fdDispenser struct {
- fds []int
-}
-
-func (f *fdDispenser) remove() int {
- if f.empty() {
- panic("fdDispenser out of fds")
- }
- rv := f.fds[0]
- f.fds = f.fds[1:]
- return rv
-}
-
-func (f *fdDispenser) empty() bool {
- return len(f.fds) == 0
-}
+func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
+ // Upper layer uses the same flags as lower, but it must be read-write.
+ upperFlags := lowerFlags
+ upperFlags.ReadOnly = false
-func adjustDirentCache(k *kernel.Kernel) error {
- var hl syscall.Rlimit
- if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
- return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
- }
- if int64(hl.Cur) != syscall.RLIM_INFINITY {
- newSize := hl.Cur / 2
- if newSize < gofer.DefaultDirentCacheSize {
- log.Infof("Setting gofer dirent cache size to %d", newSize)
- gofer.DefaultDirentCacheSize = newSize
- k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
- }
+ tmpFS := mustFindFilesystem("tmpfs")
+ if !fs.IsDir(lower.StableAttr) {
+ // Create overlay on top of mount file, e.g. /etc/hostname.
+ msrc := fs.NewCachingMountSource(tmpFS, upperFlags)
+ return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags)
}
- return nil
-}
-// setupRootContainerFS creates a mount namespace containing the root filesystem
-// and all mounts. 'rootCtx' is used to walk directories to find mount points.
-// 'setMountNS' is called after namespace is created. It must set the mount NS
-// to 'rootCtx'.
-func setupRootContainerFS(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int, setMountNS func(*fs.MountNamespace)) error {
- mounts := compileMounts(spec)
-
- // Create a tmpfs mount where we create and mount a root filesystem for
- // each child container.
- mounts = append(mounts, specs.Mount{
- Type: tmpfs,
- Destination: ChildContainersDir,
- })
-
- fds := &fdDispenser{fds: goferFDs}
- rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts)
- if err != nil {
- return fmt.Errorf("creating root mount: %v", err)
- }
- mns, err := fs.NewMountNamespace(userCtx, rootInode)
+ // Create overlay on top of mount dir.
+ upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil)
if err != nil {
- return fmt.Errorf("creating root mount namespace: %v", err)
+ return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
}
- setMountNS(mns)
-
- root := mns.Root()
- defer root.DecRef()
- return mountSubmounts(rootCtx, conf, mns, root, mounts, fds)
+ return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
}
// compileMounts returns the supported mounts from the mount spec, adding any
@@ -184,186 +144,6 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
return mounts
}
-// createRootMount creates the root filesystem.
-func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) {
- // First construct the filesystem from the spec.Root.
- mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly || conf.Overlay}
-
- var (
- rootInode *fs.Inode
- err error
- )
-
- fd := fds.remove()
- log.Infof("Mounting root over 9P, ioFD: %d", fd)
- p9FS := mustFindFilesystem("9p")
- opts := p9MountOptions(fd, conf.FileAccess)
- rootInode, err = p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
- if err != nil {
- return nil, fmt.Errorf("creating root mount point: %v", err)
- }
-
- // We need to overlay the root on top of a ramfs with stub directories
- // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
- // mounted even if they are not in the spec.
- submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp")
- rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
- if err != nil {
- return nil, fmt.Errorf("adding submount overlay: %v", err)
- }
-
- if conf.Overlay && !spec.Root.Readonly {
- log.Debugf("Adding overlay on top of root mount")
- // Overlay a tmpfs filesystem on top of the root.
- rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
- if err != nil {
- return nil, err
- }
- }
-
- log.Infof("Mounted %q to %q type root", spec.Root.Path, "/")
- return rootInode, nil
-}
-
-func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
- // Upper layer uses the same flags as lower, but it must be read-write.
- lowerFlags.ReadOnly = false
-
- tmpFS := mustFindFilesystem("tmpfs")
- if !fs.IsDir(lower.StableAttr) {
- // Create overlay on top of mount file, e.g. /etc/hostname.
- msrc := fs.NewCachingMountSource(tmpFS, lowerFlags)
- return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags)
- }
-
- // Create overlay on top of mount dir.
- upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "", nil)
- if err != nil {
- return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
- }
- return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags)
-}
-
-// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
-// used for mounts.
-func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) {
- var (
- fsName string
- opts []string
- useOverlay bool
- err error
- )
-
- switch m.Type {
- case devpts, devtmpfs, proc, sysfs:
- fsName = m.Type
- case nonefs:
- fsName = sysfs
- case tmpfs:
- fsName = m.Type
-
- // tmpfs has some extra supported options that we must pass through.
- opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
-
- case bind:
- fd := fds.remove()
- fsName = "9p"
- // Non-root bind mounts are always shared.
- opts = p9MountOptions(fd, FileAccessShared)
- // If configured, add overlay to all writable mounts.
- useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
-
- default:
- // TODO(nlacasse): Support all the mount types and make this a
- // fatal error. Most applications will "just work" without
- // them, so this is a warning for now.
- // we do not support.
- log.Warningf("ignoring unknown filesystem type %q", m.Type)
- }
- return fsName, opts, useOverlay, err
-}
-
-func mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount, fds *fdDispenser) error {
- for _, m := range mounts {
- if err := mountSubmount(ctx, conf, mns, root, fds, m, mounts); err != nil {
- return fmt.Errorf("mount submount %q: %v", m.Destination, err)
- }
- }
-
- if err := mountTmp(ctx, conf, mns, root, mounts); err != nil {
- return fmt.Errorf("mount submount %q: %v", "tmp", err)
- }
-
- if !fds.empty() {
- return fmt.Errorf("not all mount points were consumed, remaining: %v", fds)
- }
- return nil
-}
-
-// mountSubmount mounts volumes inside the container's root. Because mounts may
-// be readonly, a lower ramfs overlay is added to create the mount point dir.
-// Another overlay is added with tmpfs on top if Config.Overlay is true.
-// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
-func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, fds *fdDispenser, m specs.Mount, mounts []specs.Mount) error {
- // Map mount type to filesystem name, and parse out the options that we are
- // capable of dealing with.
- fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
-
- // Return the error or nil that corresponds to the default case in getMountNameAndOptions.
- if err != nil {
- return err
- }
- if fsName == "" {
- return nil
- }
-
- // All filesystem names should have been mapped to something we know.
- filesystem := mustFindFilesystem(fsName)
-
- mf := mountFlags(m.Options)
- if useOverlay {
- // All writes go to upper, be paranoid and make lower readonly.
- mf.ReadOnly = true
- }
-
- inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
- if err != nil {
- return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
- }
-
- // If there are submounts, we need to overlay the mount on top of a
- // ramfs with stub directories for submount paths.
- submounts := subtargets(m.Destination, mounts)
- if len(submounts) > 0 {
- log.Infof("Adding submount overlay over %q", m.Destination)
- inode, err = addSubmountOverlay(ctx, inode, submounts)
- if err != nil {
- return fmt.Errorf("adding submount overlay: %v", err)
- }
- }
-
- if useOverlay {
- log.Debugf("Adding overlay on top of mount %q", m.Destination)
- inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
- if err != nil {
- return err
- }
- }
-
- maxTraversals := uint(0)
- dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
- if err != nil {
- return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
- }
- defer dirent.DecRef()
- if err := mns.Mount(ctx, dirent, inode); err != nil {
- return fmt.Errorf("mount %q error: %v", m.Destination, err)
- }
-
- log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
- return nil
-}
-
// p9MountOptions creates a slice of options for a p9 mount.
func p9MountOptions(fd int, fa FileAccessType) []string {
opts := []string{
@@ -416,82 +196,6 @@ func mountDevice(m specs.Mount) string {
return "none"
}
-// addRestoreMount adds a mount to the MountSources map used for restoring a
-// checkpointed container.
-func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error {
- fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
-
- // Return the error or nil that corresponds to the default case in getMountNameAndOptions.
- if err != nil {
- return err
- }
- // TODO(nlacasse): Fix this when we support all the mount types and
- // make this a fatal error.
- if fsName == "" {
- return nil
- }
-
- newMount := fs.MountArgs{
- Dev: mountDevice(m),
- Flags: mountFlags(m.Options),
- DataString: strings.Join(opts, ","),
- }
- if useOverlay {
- newMount.Flags.ReadOnly = true
- }
- renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
- log.Infof("Added mount at %q: %+v", fsName, newMount)
- return nil
-}
-
-// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts
-// to the environment.
-func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) {
- renv := &fs.RestoreEnvironment{
- MountSources: make(map[string][]fs.MountArgs),
- }
-
- // Add root mount.
- fd := fds.remove()
- opts := p9MountOptions(fd, conf.FileAccess)
-
- mf := fs.MountSourceFlags{}
- if spec.Root.Readonly || conf.Overlay {
- mf.ReadOnly = true
- }
-
- rootMount := fs.MountArgs{
- Dev: rootDevice,
- Flags: mf,
- DataString: strings.Join(opts, ","),
- }
- renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
-
- // Add submounts.
- var tmpMounted bool
- for _, m := range compileMounts(spec) {
- if err := addRestoreMount(conf, renv, m, fds); err != nil {
- return nil, err
- }
- if filepath.Clean(m.Destination) == "/tmp" {
- tmpMounted = true
- }
- }
-
- // TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
- if !tmpMounted {
- tmpMount := specs.Mount{
- Type: tmpfs,
- Destination: "/tmp",
- }
- if err := addRestoreMount(conf, renv, tmpMount, fds); err != nil {
- return nil, err
- }
- }
-
- return renv, nil
-}
-
func mountFlags(opts []string) fs.MountSourceFlags {
mf := fs.MountSourceFlags{}
for _, o := range opts {
@@ -546,22 +250,254 @@ func subtargets(root string, mnts []specs.Mount) []string {
return targets
}
-// setupContainerFS is used to set up the file system and amend the procArgs accordingly.
-// procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs.
-func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
- ctx := procArgs.NewContext(k)
-
- // Create the FD map, which will set stdin, stdout, and stderr. If console
- // is true, then ioctl calls will be passed through to the host fd.
- fdm, err := createFDMap(ctx, k, ls, console, stdioFDs)
+// setExecutablePath sets the procArgs.Filename by searching the PATH for an
+// executable matching the procArgs.Argv[0].
+func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
+ paths := fs.GetPath(procArgs.Envv)
+ exe := procArgs.Argv[0]
+ f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
if err != nil {
- return fmt.Errorf("importing fds: %v", err)
+ return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+ }
+ procArgs.Filename = f
+ return nil
+}
+
+func adjustDirentCache(k *kernel.Kernel) error {
+ var hl syscall.Rlimit
+ if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
+ return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
}
+ if int64(hl.Cur) != syscall.RLIM_INFINITY {
+ newSize := hl.Cur / 2
+ if newSize < gofer.DefaultDirentCacheSize {
+ log.Infof("Setting gofer dirent cache size to %d", newSize)
+ gofer.DefaultDirentCacheSize = newSize
+ k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
+ }
+ }
+ return nil
+}
- // CreateProcess takes a reference on FDMap if successful. We
- // won't need ours either way.
- procArgs.FDMap = fdm
+type fdDispenser struct {
+ fds []int
+}
+func (f *fdDispenser) remove() int {
+ if f.empty() {
+ panic("fdDispenser out of fds")
+ }
+ rv := f.fds[0]
+ f.fds = f.fds[1:]
+ return rv
+}
+
+func (f *fdDispenser) empty() bool {
+ return len(f.fds) == 0
+}
+
+type shareType int
+
+const (
+ invalid shareType = iota
+
+ // container shareType indicates that the mount is used by a single container.
+ container
+
+ // pod shareType indicates that the mount is used by more than one container
+ // inside the pod.
+ pod
+
+ // shared shareType indicates that the mount can also be shared with a process
+ // outside the pod, e.g. NFS.
+ shared
+)
+
+func parseShare(val string) (shareType, error) {
+ switch val {
+ case "container":
+ return container, nil
+ case "pod":
+ return pod, nil
+ case "shared":
+ return shared, nil
+ default:
+ return 0, fmt.Errorf("invalid share value %q", val)
+ }
+}
+
+func (s shareType) String() string {
+ switch s {
+ case invalid:
+ return "invalid"
+ case container:
+ return "container"
+ case pod:
+ return "pod"
+ case shared:
+ return "shared"
+ default:
+ return fmt.Sprintf("invalid share value %d", s)
+ }
+}
+
+// mountHint represents extra information about mounts that are provided via
+// annotations. They can override mount type, and provide sharing information
+// so that mounts can be correctly shared inside the pod.
+type mountHint struct {
+ name string
+ share shareType
+ mount specs.Mount
+
+ // root is the inode where the volume is mounted. For mounts with 'pod' share,
+ // the volume is mounted once and then bind mounted inside the containers.
+ root *fs.Inode
+}
+
+func (m *mountHint) setField(key, val string) error {
+ switch key {
+ case "source":
+ if len(val) == 0 {
+ return fmt.Errorf("source cannot be empty")
+ }
+ m.mount.Source = val
+ case "type":
+ return m.setType(val)
+ case "share":
+ share, err := parseShare(val)
+ if err != nil {
+ return err
+ }
+ m.share = share
+ case "options":
+ return m.setOptions(val)
+ default:
+ return fmt.Errorf("invalid mount annotation: %s=%s", key, val)
+ }
+ return nil
+}
+
+func (m *mountHint) setType(val string) error {
+ switch val {
+ case "tmpfs", "bind":
+ m.mount.Type = val
+ default:
+ return fmt.Errorf("invalid type %q", val)
+ }
+ return nil
+}
+
+func (m *mountHint) setOptions(val string) error {
+ opts := strings.Split(val, ",")
+ if err := specutils.ValidateMountOptions(opts); err != nil {
+ return err
+ }
+ // Sort options so it can be compared with container mount options later on.
+ sort.Strings(opts)
+ m.mount.Options = opts
+ return nil
+}
+
+func (m *mountHint) isSupported() bool {
+ return m.mount.Type == tmpfs && m.share == pod
+}
+
+// podMountHints contains a collection of mountHints for the pod.
+type podMountHints struct {
+ mounts map[string]*mountHint
+}
+
+func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
+ mnts := make(map[string]*mountHint)
+ for k, v := range spec.Annotations {
+ // Look for 'gvisor.dev/spec/mount' annotations and parse them.
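+ // Annotation keys take the form MountPrefix/<name>/<field>, which splits
+ // into exactly five '/'-separated parts.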
+ if strings.HasPrefix(k, MountPrefix) {
+ parts := strings.Split(k, "/")
+ if len(parts) != 5 {
+ return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
+ }
+ name := parts[3]
+ if len(name) == 0 || path.Clean(name) != name {
+ return nil, fmt.Errorf("invalid mount name: %s", name)
+ }
+ mnt := mnts[name]
+ if mnt == nil {
+ mnt = &mountHint{name: name}
+ mnts[name] = mnt
+ }
+ if err := mnt.setField(parts[4], v); err != nil {
+ return nil, err
+ }
+ }
+ }
+
+ // Validate all hints after done parsing.
+ for name, m := range mnts {
+ log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share)
+ if m.share == invalid {
+ return nil, fmt.Errorf("share field for %q has not been set", m.name)
+ }
+ if len(m.mount.Source) == 0 {
+ return nil, fmt.Errorf("source field for %q has not been set", m.name)
+ }
+ if len(m.mount.Type) == 0 {
+ return nil, fmt.Errorf("type field for %q has not been set", m.name)
+ }
+
+ // Check for duplicate mount sources.
+ for name2, m2 := range mnts {
+ if name != name2 && m.mount.Source == m2.mount.Source {
+ return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source)
+ }
+ }
+ }
+
+ return &podMountHints{mounts: mnts}, nil
+}
+
+func (p *podMountHints) findMount(mount specs.Mount) *mountHint {
+ for _, m := range p.mounts {
+ if m.mount.Source == mount.Source {
+ return m
+ }
+ }
+ return nil
+}
+
+type containerMounter struct {
+ // cid is the container ID. May be set to empty for the root container.
+ cid string
+
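+ // root is the spec's root filesystem, used to create the container's
+ // root (9P) mount.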
+ root *specs.Root
+
+ // mounts is the set of submounts for the container. It's a copy from the spec
+ // that may be freely modified without affecting the original spec.
+ mounts []specs.Mount
+
+ // fds is the list of FDs to be dispensed for mounts that require it.
+ fds fdDispenser
+
+ k *kernel.Kernel
+
+ hints *podMountHints
+}
+
+func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
+ return &containerMounter{
+ cid: cid,
+ root: spec.Root,
+ mounts: compileMounts(spec),
+ fds: fdDispenser{fds: goferFDs},
+ k: k,
+ hints: hints,
+ }
+}
+
+// setupFS is used to set up the file system for containers and amend
+// the procArgs accordingly. This is the main entry point for this rest of
+// functions in this file. procArgs are passed by reference and the FDMap field
+// is modified. It dups stdioFDs.
+func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error {
// Use root user to configure mounts. The current user might not have
// permission to do so.
rootProcArgs := kernel.CreateProcessArgs{
@@ -570,16 +506,19 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf
Umask: 0022,
MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
}
- rootCtx := rootProcArgs.NewContext(k)
+ rootCtx := rootProcArgs.NewContext(c.k)
// If this is the root container, we also need to setup the root mount
// namespace.
- mns := k.RootMountNamespace()
+ mns := c.k.RootMountNamespace()
if mns == nil {
// Setup the root container.
- return setupRootContainerFS(ctx, rootCtx, spec, conf, goferFDs, func(mns *fs.MountNamespace) {
- k.SetRootMountNamespace(mns)
- })
+ if err := c.setupRootContainer(ctx, rootCtx, conf, func(mns *fs.MountNamespace) {
+ c.k.SetRootMountNamespace(mns)
+ }); err != nil {
+ return err
+ }
+ return c.checkDispenser()
}
// Setup a child container.
@@ -593,18 +532,17 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf
if err != nil {
return fmt.Errorf("couldn't find child container dir %q: %v", ChildContainersDir, err)
}
- if err := contDir.CreateDirectory(ctx, globalRoot, cid, fs.FilePermsFromMode(0755)); err != nil {
- return fmt.Errorf("create directory %q: %v", cid, err)
+ if err := contDir.CreateDirectory(ctx, globalRoot, c.cid, fs.FilePermsFromMode(0755)); err != nil {
+ return fmt.Errorf("create directory %q: %v", c.cid, err)
}
- containerRoot, err := contDir.Walk(ctx, globalRoot, cid)
+ containerRoot, err := contDir.Walk(ctx, globalRoot, c.cid)
if err != nil {
- return fmt.Errorf("walk to %q failed: %v", cid, err)
+ return fmt.Errorf("walk to %q failed: %v", c.cid, err)
}
defer containerRoot.DecRef()
// Create the container's root filesystem mount.
- fds := &fdDispenser{fds: goferFDs}
- rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil)
+ rootInode, err := c.createRootMount(rootCtx, conf)
if err != nil {
return fmt.Errorf("creating filesystem for container: %v", err)
}
@@ -614,39 +552,32 @@ func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf
return fmt.Errorf("mount container root: %v", err)
}
- // We have to re-walk to the dirent to find the mounted
- // directory. The old dirent is invalid at this point.
- containerRoot, err = contDir.Walk(ctx, globalRoot, cid)
+ // We have to re-walk to the dirent to find the mounted directory. The old
+ // dirent is invalid at this point.
+ containerRoot, err = contDir.Walk(ctx, globalRoot, c.cid)
if err != nil {
- return fmt.Errorf("find container mount point %q: %v", cid, err)
+ return fmt.Errorf("find container mount point %q: %v", c.cid, err)
}
cu := specutils.MakeCleanup(func() { containerRoot.DecRef() })
defer cu.Clean()
- log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, cid))
+ log.Infof("Mounted child's root fs to %q", filepath.Join(ChildContainersDir, c.cid))
// Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
procArgs.Root = containerRoot
// Mount all submounts.
- mounts := compileMounts(spec)
- if err := mountSubmounts(rootCtx, conf, mns, containerRoot, mounts, fds); err != nil {
+ if err := c.mountSubmounts(rootCtx, conf, mns, containerRoot); err != nil {
return err
}
cu.Release()
- return nil
+ return c.checkDispenser()
}
-// setExecutablePath sets the procArgs.Filename by searching the PATH for an
-// executable matching the procArgs.Argv[0].
-func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
- paths := fs.GetPath(procArgs.Envv)
- exe := procArgs.Argv[0]
- f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
- if err != nil {
- return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
+func (c *containerMounter) checkDispenser() error {
+ if !c.fds.empty() {
+ return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
}
- procArgs.Filename = f
return nil
}
@@ -715,17 +646,354 @@ func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error
return nil
}
+// setupRootContainer creates a mount namespace containing the root filesystem
+// and all mounts. 'rootCtx' is used to walk directories to find mount points.
+// 'setMountNS' is called after namespace is created. It must set the mount NS
+// to 'rootCtx'.
+func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
+ for _, hint := range c.hints.mounts {
+ log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
+ inode, err := c.mountSharedMaster(rootCtx, conf, hint)
+ if err != nil {
+ return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
+ }
+ hint.root = inode
+ }
+
+ // Create a tmpfs mount where we create and mount a root filesystem for
+ // each child container.
+ c.mounts = append(c.mounts, specs.Mount{
+ Type: tmpfs,
+ Destination: ChildContainersDir,
+ })
+
+ rootInode, err := c.createRootMount(rootCtx, conf)
+ if err != nil {
+ return fmt.Errorf("creating root mount: %v", err)
+ }
+ mns, err := fs.NewMountNamespace(userCtx, rootInode)
+ if err != nil {
+ return fmt.Errorf("creating root mount namespace: %v", err)
+ }
+ setMountNS(mns)
+
+ root := mns.Root()
+ defer root.DecRef()
+ return c.mountSubmounts(rootCtx, conf, mns, root)
+}
+
+// mountSharedMaster mounts the master of a volume that is shared among
+// containers in a pod. It returns the root mount's inode.
+func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
+ if err != nil {
+ return nil, err
+ }
+ if len(fsName) == 0 {
+ return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
+ }
+
+ // Mount with revalidate because it's shared among containers.
+ opts = append(opts, "cache=revalidate")
+
+ // All filesystem names should have been mapped to something we know.
+ filesystem := mustFindFilesystem(fsName)
+
+ mf := mountFlags(hint.mount.Options)
+ if useOverlay {
+ // All writes go to upper, be paranoid and make lower readonly.
+ mf.ReadOnly = true
+ }
+
+ inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return nil, fmt.Errorf("creating mount %q: %v", hint.name, err)
+ }
+
+ if useOverlay {
+ log.Debugf("Adding overlay on top of shared mount %q", hint.name)
+ inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ return inode, nil
+}
+
+// createRootMount creates the root filesystem.
+func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
+ // First construct the filesystem from the spec.Root.
+ mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
+
+ fd := c.fds.remove()
+ log.Infof("Mounting root over 9P, ioFD: %d", fd)
+ p9FS := mustFindFilesystem("9p")
+ opts := p9MountOptions(fd, conf.FileAccess)
+ rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return nil, fmt.Errorf("creating root mount point: %v", err)
+ }
+
+ // We need to overlay the root on top of a ramfs with stub directories
+ // for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
+ // mounted even if they are not in the spec.
+ submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
+ rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
+ if err != nil {
+ return nil, fmt.Errorf("adding submount overlay: %v", err)
+ }
+
+ if conf.Overlay && !c.root.Readonly {
+ log.Debugf("Adding overlay on top of root mount")
+ // Overlay a tmpfs filesystem on top of the root.
+ rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ log.Infof("Mounted %q to %q type root", c.root.Path, "/")
+ return rootInode, nil
+}
+
+// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
+// used for mounts.
+func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
+ var (
+ fsName string
+ opts []string
+ useOverlay bool
+ err error
+ )
+
+ switch m.Type {
+ case devpts, devtmpfs, proc, sysfs:
+ fsName = m.Type
+ case nonefs:
+ fsName = sysfs
+ case tmpfs:
+ fsName = m.Type
+
+ // tmpfs has some extra supported options that we must pass through.
+ opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
+
+ case bind:
+ fd := c.fds.remove()
+ fsName = "9p"
+ // Non-root bind mounts are always shared.
+ opts = p9MountOptions(fd, FileAccessShared)
+ // If configured, add overlay to all writable mounts.
+ useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
+
+ default:
+ // TODO(nlacasse): Support all the mount types and make this a fatal error.
+ // Most applications will "just work" without them, so this is a warning
+ // for now.
+ log.Warningf("ignoring unknown filesystem type %q", m.Type)
+ }
+ return fsName, opts, useOverlay, err
+}
+
+func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+ for _, m := range c.mounts {
+ if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
+ if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
+ return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
+ }
+ } else {
+ if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
+ return fmt.Errorf("mount submount %q: %v", m.Destination, err)
+ }
+ }
+ }
+
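+ // Mount the internal /tmp last: mountTmp skips it when the spec mounts
+ // /tmp explicitly or earlier submounts left the directory non-empty.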
+ if err := c.mountTmp(ctx, conf, mns, root); err != nil {
+ return fmt.Errorf("mount submount %q: %v", "tmp", err)
+ }
+ return nil
+}
+
+// mountSubmount mounts volumes inside the container's root. Because mounts may
+// be readonly, a lower ramfs overlay is added to create the mount point dir.
+// Another overlay is added with tmpfs on top if Config.Overlay is true.
+// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
+func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
+ // Map mount type to filesystem name, and parse out the options that we are
+ // capable of dealing with.
+ fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
+ if err != nil {
+ return err
+ }
+ if fsName == "" {
+ // Filesystem is not supported (e.g. cgroup), just skip it.
+ return nil
+ }
+
+ // All filesystem names should have been mapped to something we know.
+ filesystem := mustFindFilesystem(fsName)
+
+ mf := mountFlags(m.Options)
+ if useOverlay {
+ // All writes go to upper, be paranoid and make lower readonly.
+ mf.ReadOnly = true
+ }
+
+ inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
+ if err != nil {
+ return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
+ }
+
+ // If there are submounts, we need to overlay the mount on top of a ramfs
+ // with stub directories for submount paths.
+ submounts := subtargets(m.Destination, c.mounts)
+ if len(submounts) > 0 {
+ log.Infof("Adding submount overlay over %q", m.Destination)
+ inode, err = addSubmountOverlay(ctx, inode, submounts)
+ if err != nil {
+ return fmt.Errorf("adding submount overlay: %v", err)
+ }
+ }
+
+ if useOverlay {
+ log.Debugf("Adding overlay on top of mount %q", m.Destination)
+ inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
+ if err != nil {
+ return err
+ }
+ }
+
+ maxTraversals := uint(0)
+ dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
+ }
+ defer dirent.DecRef()
+ if err := mns.Mount(ctx, dirent, inode); err != nil {
+ return fmt.Errorf("mount %q error: %v", m.Destination, err)
+ }
+
+ log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
+ return nil
+}
+
+// mountSharedSubmount binds mount to a previously mounted volume that is shared
+// among containers in the same pod.
+func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error {
+ // For now enforce that all options are the same. Once bind mount is properly
+ // supported, then we should ensure the master is less restrictive than the
+ // container, e.g. master can be 'rw' while container mounts as 'ro'.
+ if len(mount.Options) != len(source.mount.Options) {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
+ }
+ sort.Strings(mount.Options)
+ for i, opt := range mount.Options {
+ if opt != source.mount.Options[i] {
+ return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
+ }
+ }
+
+ maxTraversals := uint(0)
+ target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals)
+ if err != nil {
+ return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
+ }
+ defer target.DecRef()
+
+ if err := mns.Mount(ctx, target, source.root); err != nil {
+ return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
+ }
+
+ log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
+ return nil
+}
+
+// addRestoreMount adds a mount to the MountSources map used for restoring a
+// checkpointed container.
+func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
+ fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
+ if err != nil {
+ return err
+ }
+ if fsName == "" {
+ // Filesystem is not supported (e.g. cgroup), just skip it.
+ return nil
+ }
+
+ newMount := fs.MountArgs{
+ Dev: mountDevice(m),
+ Flags: mountFlags(m.Options),
+ DataString: strings.Join(opts, ","),
+ }
+ if useOverlay {
+ newMount.Flags.ReadOnly = true
+ }
+ renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
+ log.Infof("Added mount at %q: %+v", fsName, newMount)
+ return nil
+}
+
+// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
+// the mounts to the environment.
+func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
+ renv := &fs.RestoreEnvironment{
+ MountSources: make(map[string][]fs.MountArgs),
+ }
+
+ // Add root mount.
+ fd := c.fds.remove()
+ opts := p9MountOptions(fd, conf.FileAccess)
+
+ mf := fs.MountSourceFlags{}
+ if c.root.Readonly || conf.Overlay {
+ mf.ReadOnly = true
+ }
+
+ rootMount := fs.MountArgs{
+ Dev: rootDevice,
+ Flags: mf,
+ DataString: strings.Join(opts, ","),
+ }
+ renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
+
+ // Add submounts.
+ var tmpMounted bool
+ for _, m := range c.mounts {
+ if err := c.addRestoreMount(conf, renv, m); err != nil {
+ return nil, err
+ }
+ if filepath.Clean(m.Destination) == "/tmp" {
+ tmpMounted = true
+ }
+ }
+
+ // TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
+ if !tmpMounted {
+ tmpMount := specs.Mount{
+ Type: tmpfs,
+ Destination: "/tmp",
+ }
+ if err := c.addRestoreMount(conf, renv, tmpMount); err != nil {
+ return nil, err
+ }
+ }
+
+ return renv, nil
+}
+
// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
// the host /tmp, but this is a nice optimization, and fixes some apps that call
// mknod in /tmp. It's unsafe to mount tmpfs if:
-// 1. /tmp is mounted explictly: we should not override user's wish
+// 1. /tmp is mounted explicitly: we should not override user's wish
// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
//
// Note that when there are submounts inside of '/tmp', directories for the
// mount points must be present, making '/tmp' not empty anymore.
-func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, mounts []specs.Mount) error {
- for _, m := range mounts {
+func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
+ for _, m := range c.mounts {
if filepath.Clean(m.Destination) == "/tmp" {
log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
return nil
@@ -766,7 +1034,7 @@ func mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *f
// another user. This is normally done for /tmp.
Options: []string{"mode=1777"},
}
- return mountSubmount(ctx, conf, mns, root, nil, tmpMount, mounts)
+ return c.mountSubmount(ctx, conf, mns, root, tmpMount)
default:
return err
diff --git a/runsc/boot/fs_test.go b/runsc/boot/fs_test.go
new file mode 100644
index 000000000..49ab34b33
--- /dev/null
+++ b/runsc/boot/fs_test.go
@@ -0,0 +1,193 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package boot
+
+import (
+ "path"
+ "reflect"
+ "strings"
+ "testing"
+
+ specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+func TestPodMountHintsHappy(t *testing.T) {
+ spec := &specs.Spec{
+ Annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+
+ path.Join(MountPrefix, "mount2", "source"): "bar",
+ path.Join(MountPrefix, "mount2", "type"): "bind",
+ path.Join(MountPrefix, "mount2", "share"): "container",
+ path.Join(MountPrefix, "mount2", "options"): "rw,private",
+ },
+ }
+ podHints, err := newPodMountHints(spec)
+ if err != nil {
+ t.Errorf("newPodMountHints failed: %v", err)
+ }
+
+ // Check that fields were set correctly.
+ mount1 := podHints.mounts["mount1"]
+ if want := "mount1"; want != mount1.name {
+ t.Errorf("mount1 name, want: %q, got: %q", want, mount1.name)
+ }
+ if want := "foo"; want != mount1.mount.Source {
+ t.Errorf("mount1 source, want: %q, got: %q", want, mount1.mount.Source)
+ }
+ if want := "tmpfs"; want != mount1.mount.Type {
+ t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Type)
+ }
+ if want := pod; want != mount1.share {
+ t.Errorf("mount1 type, want: %q, got: %q", want, mount1.share)
+ }
+ if want := []string(nil); !reflect.DeepEqual(want, mount1.mount.Options) {
+ t.Errorf("mount1 type, want: %q, got: %q", want, mount1.mount.Options)
+ }
+
+ mount2 := podHints.mounts["mount2"]
+ if want := "mount2"; want != mount2.name {
+ t.Errorf("mount2 name, want: %q, got: %q", want, mount2.name)
+ }
+ if want := "bar"; want != mount2.mount.Source {
+ t.Errorf("mount2 source, want: %q, got: %q", want, mount2.mount.Source)
+ }
+ if want := "bind"; want != mount2.mount.Type {
+ t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Type)
+ }
+ if want := container; want != mount2.share {
+ t.Errorf("mount2 type, want: %q, got: %q", want, mount2.share)
+ }
+ if want := []string{"private", "rw"}; !reflect.DeepEqual(want, mount2.mount.Options) {
+ t.Errorf("mount2 type, want: %q, got: %q", want, mount2.mount.Options)
+ }
+}
+
+func TestPodMountHintsErrors(t *testing.T) {
+ for _, tst := range []struct {
+ name string
+ annotations map[string]string
+ error string
+ }{
+ {
+ name: "too short",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1"): "foo",
+ },
+ error: "invalid mount annotation",
+ },
+ {
+ name: "no name",
+ annotations: map[string]string{
+ MountPrefix + "//source": "foo",
+ },
+ error: "invalid mount name",
+ },
+ {
+ name: "missing source",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ },
+ error: "source field",
+ },
+ {
+ name: "missing type",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ },
+ error: "type field",
+ },
+ {
+ name: "missing share",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ },
+ error: "share field",
+ },
+ {
+ name: "invalid field name",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "invalid"): "foo",
+ },
+ error: "invalid mount annotation",
+ },
+ {
+ name: "invalid source",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ },
+ error: "source cannot be empty",
+ },
+ {
+ name: "invalid type",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "invalid-type",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ },
+ error: "invalid type",
+ },
+ {
+ name: "invalid share",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "invalid-share",
+ },
+ error: "invalid share",
+ },
+ {
+ name: "invalid options",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+ path.Join(MountPrefix, "mount1", "options"): "invalid-option",
+ },
+ error: "unknown mount option",
+ },
+ {
+ name: "duplicate source",
+ annotations: map[string]string{
+ path.Join(MountPrefix, "mount1", "source"): "foo",
+ path.Join(MountPrefix, "mount1", "type"): "tmpfs",
+ path.Join(MountPrefix, "mount1", "share"): "pod",
+
+ path.Join(MountPrefix, "mount2", "source"): "foo",
+ path.Join(MountPrefix, "mount2", "type"): "bind",
+ path.Join(MountPrefix, "mount2", "share"): "container",
+ },
+ error: "have the same mount source",
+ },
+ } {
+ t.Run(tst.name, func(t *testing.T) {
+ spec := &specs.Spec{Annotations: tst.annotations}
+ podHints, err := newPodMountHints(spec)
+ if err == nil || !strings.Contains(err.Error(), tst.error) {
+ t.Errorf("newPodMountHints invalid error, want: .*%s.*, got: %v", tst.error, err)
+ }
+ if podHints != nil {
+ t.Errorf("newPodMountHints must return nil on failure: %+v", podHints)
+ }
+ })
+ }
+}
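The annotation layout these tests exercise is MountPrefix/<mount-name>/{source,type,share,options}. Below is a minimal, self-contained sketch of writing such hints onto an OCI spec; the MountPrefix value and the addMountHint helper are assumptions for illustration, not definitions taken from this patch.

package main

import (
	"fmt"
	"path"
	"strings"

	specs "github.com/opencontainers/runtime-spec/specs-go"
)

// MountPrefix stands in for the annotation prefix used by runsc/boot; the
// exact value here is an assumption.
const MountPrefix = "gvisor.dev/spec/mount"

// addMountHint is a hypothetical helper that writes the four hint fields
// ("source", "type", "share", "options") checked by the tests above.
func addMountHint(spec *specs.Spec, name, source, fsType, share string, options []string) {
	if spec.Annotations == nil {
		spec.Annotations = map[string]string{}
	}
	spec.Annotations[path.Join(MountPrefix, name, "source")] = source
	spec.Annotations[path.Join(MountPrefix, name, "type")] = fsType
	spec.Annotations[path.Join(MountPrefix, name, "share")] = share
	if len(options) > 0 {
		spec.Annotations[path.Join(MountPrefix, name, "options")] = strings.Join(options, ",")
	}
}

func main() {
	spec := &specs.Spec{}
	addMountHint(spec, "shared-tmp", "/some/dir", "tmpfs", "pod", []string{"rw"})
	for k, v := range spec.Annotations {
		fmt.Printf("%s = %s\n", k, v)
	}
}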
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 6ac6b94dd..c1dea736f 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -29,6 +29,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/cpuid"
"gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/memutil"
"gvisor.googlesource.com/gvisor/pkg/rand"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/control"
@@ -37,7 +38,6 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/sentry/loader"
- "gvisor.googlesource.com/gvisor/pkg/sentry/memutil"
"gvisor.googlesource.com/gvisor/pkg/sentry/pgalloc"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
@@ -117,6 +117,10 @@ type Loader struct {
//
// processes is guarded by mu.
processes map[execID]*execProcess
+
+ // mountHints provides extra information about container mounts that
+ // apply to the entire pod.
+ mountHints *podMountHints
}
// execID uniquely identifies a sentry process that is executed in a container.
@@ -288,7 +292,7 @@ func New(args Args) (*Loader, error) {
}
// Create a watchdog.
- watchdog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
+ dog := watchdog.New(k, watchdog.DefaultTimeout, args.Conf.WatchdogAction)
procArgs, err := newProcess(args.ID, args.Spec, creds, k)
if err != nil {
@@ -299,18 +303,24 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("initializing compat logs: %v", err)
}
+ mountHints, err := newPodMountHints(args.Spec)
+ if err != nil {
+ return nil, fmt.Errorf("creating pod mount hints: %v", err)
+ }
+
eid := execID{cid: args.ID}
l := &Loader{
k: k,
conf: args.Conf,
console: args.Console,
- watchdog: watchdog,
+ watchdog: dog,
spec: args.Spec,
goferFDs: args.GoferFDs,
stdioFDs: args.StdioFDs,
rootProcArgs: procArgs,
sandboxID: args.ID,
processes: map[execID]*execProcess{eid: {}},
+ mountHints: mountHints,
}
// We don't care about child signals; some platforms can generate a
@@ -424,6 +434,9 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
return nil, fmt.Errorf("error creating memfd: %v", err)
}
memfile := os.NewFile(uintptr(memfd), memfileName)
+ // We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
+ // there are memory cgroups specified, because at this point we're already
+ // in a mount namespace in which the relevant cgroupfs is not visible.
mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
if err != nil {
memfile.Close()
@@ -432,7 +445,24 @@ func createMemoryFile() (*pgalloc.MemoryFile, error) {
return mf, nil
}
-// Run runs the root container..
+func (l *Loader) installSeccompFilters() error {
+ if l.conf.DisableSeccomp {
+ filter.Report("syscall filter is DISABLED. Running in less secure mode.")
+ } else {
+ opts := filter.Options{
+ Platform: l.k.Platform,
+ HostNetwork: l.conf.Network == NetworkHost,
+ ProfileEnable: l.conf.ProfileEnable,
+ ControllerFD: l.ctrl.srv.FD(),
+ }
+ if err := filter.Install(opts); err != nil {
+ return fmt.Errorf("installing seccomp filters: %v", err)
+ }
+ }
+ return nil
+}
+
+// Run runs the root container.
func (l *Loader) Run() error {
err := l.run()
l.ctrl.manager.startResultChan <- err
@@ -467,36 +497,34 @@ func (l *Loader) run() error {
return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
}
- // Finally done with all configuration. Setup filters before user code
- // is loaded.
- if l.conf.DisableSeccomp {
- filter.Report("syscall filter is DISABLED. Running in less secure mode.")
- } else {
- opts := filter.Options{
- Platform: l.k.Platform,
- HostNetwork: l.conf.Network == NetworkHost,
- ProfileEnable: l.conf.ProfileEnable,
- ControllerFD: l.ctrl.srv.FD(),
- }
- if err := filter.Install(opts); err != nil {
- return fmt.Errorf("installing seccomp filters: %v", err)
- }
- }
-
// If we are restoring, we do not want to create a process.
// l.restore is set by the container manager when a restore call is made.
if !l.restore {
- if err := setupContainerFS(
- &l.rootProcArgs,
- l.spec,
- l.conf,
- l.stdioFDs,
- l.goferFDs,
- l.console,
- l.rootProcArgs.Credentials,
- l.rootProcArgs.Limits,
- l.k,
- "" /* CID, which isn't needed for the root container */); err != nil {
+ if l.conf.ProfileEnable {
+ initializePProf()
+ }
+
+ // Finally done with all configuration. Setup filters before user code
+ // is loaded.
+ if err := l.installSeccompFilters(); err != nil {
+ return err
+ }
+
+ // Create the FD map, which will set stdin, stdout, and stderr. If console
+ // is true, then ioctl calls will be passed through to the host fd.
+ ctx := l.rootProcArgs.NewContext(l.k)
+ fdm, err := createFDMap(ctx, l.rootProcArgs.Limits, l.console, l.stdioFDs)
+ if err != nil {
+ return fmt.Errorf("importing fds: %v", err)
+ }
+ // CreateProcess takes a reference on FDMap if successful. We won't need
+ // ours either way.
+ l.rootProcArgs.FDMap = fdm
+
+ // cid for root container can be empty. Only subcontainers need it to set
+ // the mount location.
+ mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints)
+ if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil {
return err
}
@@ -552,7 +580,7 @@ func (l *Loader) createContainer(cid string) error {
// startContainer starts a child container. It returns the thread group ID of
// the newly created process. Caller owns 'files' and may close them after
// this method returns.
-func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
+func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
// Create capabilities.
caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
if err != nil {
@@ -596,6 +624,16 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
stdioFDs = append(stdioFDs, int(f.Fd()))
}
+ // Create the FD map, which will set stdin, stdout, and stderr.
+ ctx := procArgs.NewContext(l.k)
+ fdm, err := createFDMap(ctx, procArgs.Limits, false, stdioFDs)
+ if err != nil {
+ return fmt.Errorf("importing fds: %v", err)
+ }
+ // CreateProcess takes a reference on FDMap if successful. We won't need ours
+ // either way.
+ procArgs.FDMap = fdm
+
// Can't take ownership away from os.File. dup them to get a new FDs.
var goferFDs []int
for _, f := range files[3:] {
@@ -606,22 +644,12 @@ func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config
goferFDs = append(goferFDs, fd)
}
- if err := setupContainerFS(
- &procArgs,
- spec,
- conf,
- stdioFDs,
- goferFDs,
- false,
- creds,
- procArgs.Limits,
- k,
- cid); err != nil {
+ mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints)
+ if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil {
return fmt.Errorf("configuring container FS: %v", err)
}
- ctx := procArgs.NewContext(l.k)
- mns := k.RootMountNamespace()
+ mns := l.k.RootMountNamespace()
if err := setExecutablePath(ctx, mns, &procArgs); err != nil {
return fmt.Errorf("setting executable path for %+v: %v", procArgs, err)
}
@@ -724,7 +752,7 @@ func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
return nil
}
-func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, waitStatus *uint32) error {
+func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
if tgid <= 0 {
return fmt.Errorf("PID (%d) must be positive", tgid)
}
@@ -736,13 +764,10 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, clearStatus bool, wai
ws := l.wait(execTG)
*waitStatus = ws
- // Remove tg from the cache if caller requested it.
- if clearStatus {
- l.mu.Lock()
- delete(l.processes, eid)
- log.Debugf("updated processes (removal): %v", l.processes)
- l.mu.Unlock()
- }
+ l.mu.Lock()
+ delete(l.processes, eid)
+ log.Debugf("updated processes (removal): %v", l.processes)
+ l.mu.Unlock()
return nil
}
diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go
index 4603f751d..2f2499811 100644
--- a/runsc/boot/loader_test.go
+++ b/runsc/boot/loader_test.go
@@ -397,14 +397,15 @@ func TestCreateMountNamespace(t *testing.T) {
}
defer cleanup()
- // setupRootContainerFS needs to find root from the context after the
+ // setupRootContainer needs to find root from the context after the
// namespace is created.
var mns *fs.MountNamespace
setMountNS := func(m *fs.MountNamespace) {
mns = m
ctx.(*contexttest.TestContext).RegisterValue(fs.CtxRoot, mns.Root())
}
- if err := setupRootContainerFS(ctx, ctx, &tc.spec, conf, []int{sandEnd}, setMountNS); err != nil {
+ mntr := newContainerMounter(&tc.spec, "", []int{sandEnd}, nil, &podMountHints{})
+ if err := mntr.setupRootContainer(ctx, ctx, conf, setMountNS); err != nil {
t.Fatalf("createMountNamespace test case %q failed: %v", tc.name, err)
}
root := mns.Root()
@@ -609,8 +610,8 @@ func TestRestoreEnvironment(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
conf := testConfig()
- fds := &fdDispenser{fds: tc.ioFDs}
- actualRenv, err := createRestoreEnvironment(tc.spec, conf, fds)
+ mntr := newContainerMounter(tc.spec, "", tc.ioFDs, nil, &podMountHints{})
+ actualRenv, err := mntr.createRestoreEnvironment(conf)
if !tc.errorExpected && err != nil {
t.Fatalf("could not create restore environment for test:%s", tc.name)
} else if tc.errorExpected {
diff --git a/runsc/boot/network.go b/runsc/boot/network.go
index 0a154d90b..d86803252 100644
--- a/runsc/boot/network.go
+++ b/runsc/boot/network.go
@@ -56,7 +56,11 @@ type FDBasedLink struct {
Addresses []net.IP
Routes []Route
GSOMaxSize uint32
- LinkAddress []byte
+ LinkAddress net.HardwareAddr
+
+ // NumChannels controls how many underlying FDs are to be used to
+ // create this endpoint.
+ NumChannels int
}
// LoopbackLink configures a loopback link.
@@ -68,8 +72,9 @@ type LoopbackLink struct {
// CreateLinksAndRoutesArgs are arguments to CreateLinkAndRoutes.
type CreateLinksAndRoutesArgs struct {
- // FilePayload contains the fds associated with the FDBasedLinks. The
- // two slices must have the same length.
+ // FilePayload contains the fds associated with the FDBasedLinks. The
+ // number of fds should match the sum of the NumChannels field of the
+ // FDBasedLink entries below.
urpc.FilePayload
LoopbackLinks []LoopbackLink
@@ -95,8 +100,12 @@ func (r *Route) toTcpipRoute(id tcpip.NICID) tcpip.Route {
// CreateLinksAndRoutes creates links and routes in a network stack. It should
// only be called once.
func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct{}) error {
- if len(args.FilePayload.Files) != len(args.FDBasedLinks) {
- return fmt.Errorf("FilePayload must be same length at FDBasedLinks")
+ wantFDs := 0
+ for _, l := range args.FDBasedLinks {
+ wantFDs += l.NumChannels
+ }
+ if got := len(args.FilePayload.Files); got != wantFDs {
+ return fmt.Errorf("args.FilePayload.Files has %d FD's but we need %d entries based on FDBasedLinks", got, wantFDs)
}
var nicID tcpip.NICID
@@ -123,20 +132,26 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
}
}
- for i, link := range args.FDBasedLinks {
+ fdOffset := 0
+ for _, link := range args.FDBasedLinks {
nicID++
nicids[link.Name] = nicID
- // Copy the underlying FD.
- oldFD := args.FilePayload.Files[i].Fd()
- newFD, err := syscall.Dup(int(oldFD))
- if err != nil {
- return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+ FDs := []int{}
+ for j := 0; j < link.NumChannels; j++ {
+ // Copy the underlying FD.
+ oldFD := args.FilePayload.Files[fdOffset].Fd()
+ newFD, err := syscall.Dup(int(oldFD))
+ if err != nil {
+ return fmt.Errorf("failed to dup FD %v: %v", oldFD, err)
+ }
+ FDs = append(FDs, newFD)
+ fdOffset++
}
mac := tcpip.LinkAddress(link.LinkAddress)
linkEP, err := fdbased.New(&fdbased.Options{
- FD: newFD,
+ FDs: FDs,
MTU: uint32(link.MTU),
EthernetHeader: true,
Address: mac,
@@ -148,7 +163,7 @@ func (n *Network) CreateLinksAndRoutes(args *CreateLinksAndRoutesArgs, _ *struct
return err
}
- log.Infof("Enabling interface %q with id %d on addresses %+v (%v)", link.Name, nicID, link.Addresses, mac)
+ log.Infof("Enabling interface %q with id %d on addresses %+v (%v) w/ %d channels", link.Name, nicID, link.Addresses, mac, link.NumChannels)
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses, false /* loopback */); err != nil {
return err
}
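The new invariant above is that the number of donated files must equal the sum of NumChannels across all FD-based links. A hedged, self-contained sketch of that bookkeeping follows; fdBasedLink and createArgs are simplified stand-ins for the real structs, and the files are placeholders.

package main

import (
	"fmt"
	"os"
)

// fdBasedLink and createArgs mirror only the fields relevant to the FD count
// check in CreateLinksAndRoutes.
type fdBasedLink struct {
	Name        string
	NumChannels int
}

type createArgs struct {
	Files        []*os.File // donated FDs, e.g. carried by urpc.FilePayload
	FDBasedLinks []fdBasedLink
}

// validateFDCount applies the same invariant as the code above: the number
// of files must equal the sum of NumChannels over all links.
func validateFDCount(args createArgs) error {
	wantFDs := 0
	for _, l := range args.FDBasedLinks {
		wantFDs += l.NumChannels
	}
	if got := len(args.Files); got != wantFDs {
		return fmt.Errorf("got %d FDs, want %d (sum of NumChannels)", got, wantFDs)
	}
	return nil
}

func main() {
	args := createArgs{
		Files:        []*os.File{os.Stdin, os.Stdout}, // placeholders for packet socket FDs
		FDBasedLinks: []fdBasedLink{{Name: "eth0", NumChannels: 2}},
	}
	fmt.Println(validateFDCount(args)) // <nil>: 2 files for 2 channels
}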
diff --git a/pkg/sentry/memutil/memutil.go b/runsc/boot/pprof.go
index a4154c42a..463362f02 100644
--- a/pkg/sentry/memutil/memutil.go
+++ b/runsc/boot/pprof.go
@@ -1,4 +1,4 @@
-// Copyright 2018 The gVisor Authors.
+// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,5 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Package memutil contains the utility functions for memory operations.
-package memutil
+package boot
+
+func initializePProf() {
+}
diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD
index b7551a5ab..df6af0ced 100644
--- a/runsc/cmd/BUILD
+++ b/runsc/cmd/BUILD
@@ -14,9 +14,11 @@ go_library(
"debug.go",
"delete.go",
"do.go",
+ "error.go",
"events.go",
"exec.go",
"gofer.go",
+ "help.go",
"kill.go",
"list.go",
"path.go",
@@ -28,6 +30,7 @@ go_library(
"spec.go",
"start.go",
"state.go",
+ "syscalls.go",
"wait.go",
],
importpath = "gvisor.googlesource.com/gvisor/runsc/cmd",
@@ -38,6 +41,7 @@ go_library(
"//pkg/log",
"//pkg/p9",
"//pkg/sentry/control",
+ "//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/unet",
"//pkg/urpc",
diff --git a/runsc/cmd/boot.go b/runsc/cmd/boot.go
index 3a547d4aa..e0a950e9c 100644
--- a/runsc/cmd/boot.go
+++ b/runsc/cmd/boot.go
@@ -130,6 +130,8 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// Ensure that if there is a panic, all goroutine stacks are printed.
debug.SetTraceback("all")
+ conf := args[0].(*boot.Config)
+
if b.setUpRoot {
if err := setUpChroot(b.pidns); err != nil {
Fatalf("error setting up chroot: %v", err)
@@ -143,14 +145,16 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
args = append(args, arg)
}
}
- // Note that we've already read the spec from the spec FD, and
- // we will read it again after the exec call. This works
- // because the ReadSpecFromFile function seeks to the beginning
- // of the file before reading.
- if err := callSelfAsNobody(args); err != nil {
- Fatalf("%v", err)
+ if !conf.Rootless {
+ // Note that we've already read the spec from the spec FD, and
+ // we will read it again after the exec call. This works
+ // because the ReadSpecFromFile function seeks to the beginning
+ // of the file before reading.
+ if err := callSelfAsNobody(args); err != nil {
+ Fatalf("%v", err)
+ }
+ panic("callSelfAsNobody must never return success")
}
- panic("callSelfAsNobody must never return success")
}
}
@@ -163,9 +167,6 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
}
specutils.LogSpec(spec)
- conf := args[0].(*boot.Config)
- waitStatus := args[1].(*syscall.WaitStatus)
-
if b.applyCaps {
caps := spec.Process.Capabilities
if caps == nil {
@@ -251,6 +252,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
ws := l.WaitExit()
log.Infof("application exiting with %+v", ws)
+ waitStatus := args[1].(*syscall.WaitStatus)
*waitStatus = syscall.WaitStatus(ws.Status())
l.Destroy()
return subcommands.ExitSuccess
diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go
index ee74d33d8..2825dfaa5 100644
--- a/runsc/cmd/capability_test.go
+++ b/runsc/cmd/capability_test.go
@@ -116,6 +116,6 @@ func TestCapabilities(t *testing.T) {
}
func TestMain(m *testing.M) {
- testutil.RunAsRoot()
+ specutils.MaybeRunAsRoot()
os.Exit(m.Run())
}
diff --git a/runsc/cmd/cmd.go b/runsc/cmd/cmd.go
index a2fc377d1..5b4cc4a39 100644
--- a/runsc/cmd/cmd.go
+++ b/runsc/cmd/cmd.go
@@ -17,34 +17,15 @@ package cmd
import (
"fmt"
- "os"
"runtime"
"strconv"
"syscall"
- "github.com/google/subcommands"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/runsc/specutils"
)
-// Errorf logs to stderr and returns subcommands.ExitFailure.
-func Errorf(s string, args ...interface{}) subcommands.ExitStatus {
- // If runsc is being invoked by docker or cri-o, then we might not have
- // access to stderr, so we log a serious-looking warning in addition to
- // writing to stderr.
- log.Warningf("FATAL ERROR: "+s, args...)
- fmt.Fprintf(os.Stderr, s+"\n", args...)
- // Return an error that is unlikely to be used by the application.
- return subcommands.ExitFailure
-}
-
-// Fatalf logs to stderr and exits with a failure status code.
-func Fatalf(s string, args ...interface{}) {
- Errorf(s, args...)
- os.Exit(128)
-}
-
// intFlags can be used with int flags that appear multiple times.
type intFlags []int
diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go
index 629c198fd..e82e8c667 100644
--- a/runsc/cmd/create.go
+++ b/runsc/cmd/create.go
@@ -16,7 +16,6 @@ package cmd
import (
"context"
-
"flag"
"github.com/google/subcommands"
"gvisor.googlesource.com/gvisor/runsc/boot"
@@ -83,13 +82,17 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
id := f.Arg(0)
conf := args[0].(*boot.Config)
+ if conf.Rootless {
+ return Errorf("Rootless mode not supported with %q", c.Name())
+ }
+
bundleDir := c.bundleDir
if bundleDir == "" {
bundleDir = getwdOrDie()
}
spec, err := specutils.ReadSpec(bundleDir)
if err != nil {
- Fatalf("reading spec: %v", err)
+ return Errorf("reading spec: %v", err)
}
specutils.LogSpec(spec)
@@ -97,7 +100,7 @@ func (c *Create) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}
// container unless the metadata specifies that it should be run in an
// existing container.
if _, err := container.Create(id, spec, conf, bundleDir, c.consoleSocket, c.pidFile, c.userLog); err != nil {
- Fatalf("creating container: %v", err)
+ return Errorf("creating container: %v", err)
}
return subcommands.ExitSuccess
}
diff --git a/runsc/cmd/do.go b/runsc/cmd/do.go
index 8ea59046c..3f6e46fce 100644
--- a/runsc/cmd/do.go
+++ b/runsc/cmd/do.go
@@ -39,10 +39,9 @@ import (
// Do implements subcommands.Command for the "do" command. It sets up a simple
// sandbox and executes the command inside it. See Usage() for more details.
type Do struct {
- root string
- cwd string
- ip string
- networkNamespace bool
+ root string
+ cwd string
+ ip string
}
// Name implements subcommands.Command.Name.
@@ -72,7 +71,6 @@ func (c *Do) SetFlags(f *flag.FlagSet) {
f.StringVar(&c.root, "root", "/", `path to the root directory, defaults to "/"`)
f.StringVar(&c.cwd, "cwd", ".", "path to the current directory, defaults to the current directory")
f.StringVar(&c.ip, "ip", "192.168.10.2", "IPv4 address for the sandbox")
- f.BoolVar(&c.networkNamespace, "netns", true, "run in a new network namespace")
}
// Execute implements subcommands.Command.Execute.
@@ -85,15 +83,21 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
conf := args[0].(*boot.Config)
waitStatus := args[1].(*syscall.WaitStatus)
- // Map the entire host file system, but make it readonly with a writable
- // overlay on top (ignore --overlay option).
- conf.Overlay = true
+ if conf.Rootless {
+ if err := specutils.MaybeRunAsRoot(); err != nil {
+ return Errorf("Error executing inside namespace: %v", err)
+ }
+ // Execution will continue here if no more capabilities are needed...
+ }
hostname, err := os.Hostname()
if err != nil {
return Errorf("Error to retrieve hostname: %v", err)
}
+ // Map the entire host file system, but make it readonly with a writable
+ // overlay on top (ignore --overlay option).
+ conf.Overlay = true
absRoot, err := resolvePath(c.root)
if err != nil {
return Errorf("Error resolving root: %v", err)
@@ -119,11 +123,22 @@ func (c *Do) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) su
specutils.LogSpec(spec)
cid := fmt.Sprintf("runsc-%06d", rand.Int31n(1000000))
- if !c.networkNamespace {
- if conf.Network != boot.NetworkHost {
- Fatalf("The current network namespace can be used only if --network=host is set", nil)
+ if conf.Network == boot.NetworkNone {
+ netns := specs.LinuxNamespace{
+ Type: specs.NetworkNamespace,
+ }
+ if spec.Linux != nil {
+ panic("spec.Linux is not nil")
}
- } else if conf.Network != boot.NetworkNone {
+ spec.Linux = &specs.Linux{Namespaces: []specs.LinuxNamespace{netns}}
+
+ } else if conf.Rootless {
+ if conf.Network == boot.NetworkSandbox {
+ fmt.Println("*** Rootless requires changing network type to host ***")
+ conf.Network = boot.NetworkHost
+ }
+
+ } else {
clean, err := c.setupNet(cid, spec)
if err != nil {
return Errorf("Error setting up network: %v", err)
diff --git a/runsc/cmd/error.go b/runsc/cmd/error.go
new file mode 100644
index 000000000..700b19f14
--- /dev/null
+++ b/runsc/cmd/error.go
@@ -0,0 +1,72 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "encoding/json"
+ "fmt"
+ "io"
+ "os"
+ "time"
+
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/log"
+)
+
+// ErrorLogger is where error messages should be written to. These messages are
+// consumed by containerd and show up to users of command line tools,
+// like docker/kubectl.
+var ErrorLogger io.Writer
+
+type jsonError struct {
+ Msg string `json:"msg"`
+ Level string `json:"level"`
+ Time time.Time `json:"time"`
+}
+
+// Errorf logs the error to the containerd log (--log), to stderr, and to the
+// debug logs. It returns subcommands.ExitFailure for convenience with
+// subcommands.Command.Execute() methods:
+// return Errorf("Danger! Danger!")
+//
+func Errorf(format string, args ...interface{}) subcommands.ExitStatus {
+ // If runsc is being invoked by docker or cri-o, then we might not have
+ // access to stderr, so we log a serious-looking warning in addition to
+ // writing to stderr.
+ log.Warningf("FATAL ERROR: "+format, args...)
+ fmt.Fprintf(os.Stderr, format+"\n", args...)
+
+ j := jsonError{
+ Msg: fmt.Sprintf(format, args...),
+ Level: "error",
+ Time: time.Now(),
+ }
+ b, err := json.Marshal(j)
+ if err != nil {
+ panic(err)
+ }
+ if ErrorLogger != nil {
+ ErrorLogger.Write(b)
+ }
+
+ return subcommands.ExitFailure
+}
+
+// Fatalf logs the same way as Errorf() does, plus *exits* the process.
+func Fatalf(format string, args ...interface{}) {
+ Errorf(format, args...)
+ // Exit with a code that is unlikely to be used by the application.
+ os.Exit(128)
+}
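As a small illustration of the error record Errorf emits to ErrorLogger, the standalone sketch below marshals the same jsonError shape; the message text is made up.

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// jsonError mirrors the struct defined in error.go above.
type jsonError struct {
	Msg   string    `json:"msg"`
	Level string    `json:"level"`
	Time  time.Time `json:"time"`
}

func main() {
	j := jsonError{
		Msg:   fmt.Sprintf("reading spec: %v", "file not found"),
		Level: "error",
		Time:  time.Now(),
	}
	b, err := json.Marshal(j)
	if err != nil {
		panic(err)
	}
	// Example output: {"msg":"reading spec: file not found","level":"error","time":"..."}
	fmt.Println(string(b))
}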
diff --git a/runsc/cmd/exec.go b/runsc/cmd/exec.go
index 52fd7ac4b..0eeaaadba 100644
--- a/runsc/cmd/exec.go
+++ b/runsc/cmd/exec.go
@@ -40,8 +40,6 @@ import (
"gvisor.googlesource.com/gvisor/runsc/specutils"
)
-const privateClearStatusFlag = "private-clear-status"
-
// Exec implements subcommands.Command for the "exec" command.
type Exec struct {
cwd string
@@ -51,7 +49,6 @@ type Exec struct {
extraKGIDs stringSlice
caps stringSlice
detach bool
- clearStatus bool
processPath string
pidFile string
internalPidFile string
@@ -103,10 +100,6 @@ func (ex *Exec) SetFlags(f *flag.FlagSet) {
f.StringVar(&ex.pidFile, "pid-file", "", "filename that the container pid will be written to")
f.StringVar(&ex.internalPidFile, "internal-pid-file", "", "filename that the container-internal pid will be written to")
f.StringVar(&ex.consoleSocket, "console-socket", "", "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal")
-
- // This flag clears the status of the exec'd process upon completion. It is
- // only used when we fork due to --detach being set on the parent.
- f.BoolVar(&ex.clearStatus, privateClearStatusFlag, true, "private flag, do not use")
}
// Execute implements subcommands.Command.Execute. It starts a process in an
@@ -150,13 +143,16 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
// write the child's PID to the pid file. So when the container returns, the
// child process will also return and signal containerd.
if ex.detach {
- return ex.execAndWait(waitStatus)
+ return ex.execChildAndWait(waitStatus)
}
+ return ex.exec(c, e, waitStatus)
+}
+func (ex *Exec) exec(c *container.Container, e *control.ExecArgs, waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
// Start the new process and get it pid.
pid, err := c.Execute(e)
if err != nil {
- Fatalf("getting processes for container: %v", err)
+ return Errorf("executing processes for container: %v", err)
}
if e.StdioIsPty {
@@ -170,33 +166,37 @@ func (ex *Exec) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
if ex.internalPidFile != "" {
pidStr := []byte(strconv.Itoa(int(pid)))
if err := ioutil.WriteFile(ex.internalPidFile, pidStr, 0644); err != nil {
- Fatalf("writing internal pid file %q: %v", ex.internalPidFile, err)
+ return Errorf("writing internal pid file %q: %v", ex.internalPidFile, err)
}
}
- // Generate the pid file after the internal pid file is generated, so that users
- // can safely assume that the internal pid file is ready after `runsc exec -d`
- // returns.
+ // Generate the pid file after the internal pid file is generated, so that
+ // users can safely assume that the internal pid file is ready after
+ // `runsc exec -d` returns.
if ex.pidFile != "" {
if err := ioutil.WriteFile(ex.pidFile, []byte(strconv.Itoa(os.Getpid())), 0644); err != nil {
- Fatalf("writing pid file: %v", err)
+ return Errorf("writing pid file: %v", err)
}
}
// Wait for the process to exit.
- ws, err := c.WaitPID(pid, ex.clearStatus)
+ ws, err := c.WaitPID(pid)
if err != nil {
- Fatalf("waiting on pid %d: %v", pid, err)
+ return Errorf("waiting on pid %d: %v", pid, err)
}
*waitStatus = ws
return subcommands.ExitSuccess
}
-func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
- binPath := specutils.ExePath
+func (ex *Exec) execChildAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStatus {
var args []string
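+ // Copy the original arguments for the child process, skipping the "detach" flag.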
+ for _, a := range os.Args[1:] {
+ if !strings.Contains(a, "detach") {
+ args = append(args, a)
+ }
+ }
- // The command needs to write a pid file so that execAndWait can tell
+ // The command needs to write a pid file so that execChildAndWait can tell
// when it has started. If no pid-file was provided, we should use a
// filename in a temp directory.
pidFile := ex.pidFile
@@ -210,19 +210,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
args = append(args, "--pid-file="+pidFile)
}
- // Add the rest of the args, excluding the "detach" flag.
- for _, a := range os.Args[1:] {
- if strings.Contains(a, "detach") {
- // Replace with the "private-clear-status" flag, which tells
- // the new process it's a detached child and shouldn't
- // clear the exit status of the sentry process.
- args = append(args, fmt.Sprintf("--%s=false", privateClearStatusFlag))
- } else {
- args = append(args, a)
- }
- }
-
- cmd := exec.Command(binPath, args...)
+ cmd := exec.Command(specutils.ExePath, args...)
cmd.Args[0] = "runsc-exec"
// Exec stdio defaults to current process stdio.
@@ -233,8 +221,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
// If the console control socket file is provided, then create a new
// pty master/slave pair and set the TTY on the sandbox process.
if ex.consoleSocket != "" {
- // Create a new TTY pair and send the master on the provided
- // socket.
+ // Create a new TTY pair and send the master on the provided socket.
tty, err := console.NewWithSocket(ex.consoleSocket)
if err != nil {
Fatalf("setting up console with socket %q: %v", ex.consoleSocket, err)
@@ -256,7 +243,7 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
Fatalf("failure to start child exec process, err: %v", err)
}
- log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, binPath, args)
+ log.Infof("Started child (PID: %d) to exec and wait: %s %s", cmd.Process.Pid, specutils.ExePath, args)
// Wait for PID file to ensure that child process has started. Otherwise,
// '--process' file is deleted as soon as this process returns and the child
@@ -278,7 +265,10 @@ func (ex *Exec) execAndWait(waitStatus *syscall.WaitStatus) subcommands.ExitStat
return false, nil
}
if err := specutils.WaitForReady(cmd.Process.Pid, 10*time.Second, ready); err != nil {
- Fatalf("unexpected error waiting for PID file, err: %v", err)
+ // Don't log fatal error here, otherwise it will override the error logged
+ // by the child process that has failed to start.
+ log.Warningf("Unexpected error waiting for PID file, err: %v", err)
+ return subcommands.ExitFailure
}
*waitStatus = 0
diff --git a/runsc/cmd/help.go b/runsc/cmd/help.go
new file mode 100644
index 000000000..ff4f901cb
--- /dev/null
+++ b/runsc/cmd/help.go
@@ -0,0 +1,126 @@
+// Copyright 2018 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "fmt"
+
+ "flag"
+ "github.com/google/subcommands"
+)
+
+// NewHelp returns a help command for the given commander.
+func NewHelp(cdr *subcommands.Commander) *Help {
+ return &Help{
+ cdr: cdr,
+ }
+}
+
+// Help implements subcommands.Command for the "help" command. The 'help'
+// command prints help for commands registered to a Commander but also allows for
+// registering additional help commands that print other documentation.
+type Help struct {
+ cdr *subcommands.Commander
+ commands []subcommands.Command
+ help bool
+}
+
+// Name implements subcommands.Command.Name.
+func (*Help) Name() string {
+ return "help"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Help) Synopsis() string {
+ return "Print help documentation."
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Help) Usage() string {
+ return `help [<subcommand>]:
+ With an argument, prints detailed information on the use of
+ the specified topic or subcommand. With no argument, prints a list of
+ all commands and a brief description of each.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (h *Help) SetFlags(f *flag.FlagSet) {}
+
+// Execute implements subcommands.Command.Execute.
+func (h *Help) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ switch f.NArg() {
+ case 0:
+ fmt.Fprintf(h.cdr.Output, "Usage: %s <flags> <subcommand> <subcommand args>\n\n", h.cdr.Name())
+ fmt.Fprintf(h.cdr.Output, `runsc is a command line client for running applications packaged in the Open
+Container Initiative (OCI) format. Applications run by runsc are run in an
+isolated gVisor sandbox that emulates a Linux environment.
+
+gVisor is a user-space kernel, written in Go, that implements a substantial
+portion of the Linux system call interface. It provides an additional layer
+of isolation between running applications and the host operating system.
+
+Functionality is provided by subcommands. For additional help on individual
+subcommands use "%s %s <subcommand>".
+
+`, h.cdr.Name(), h.Name())
+ h.cdr.VisitGroups(func(g *subcommands.CommandGroup) {
+ h.cdr.ExplainGroup(h.cdr.Output, g)
+ })
+
+ fmt.Fprintf(h.cdr.Output, "Additional help topics (Use \"%s %s <topic>\" to see help on the topic):\n", h.cdr.Name(), h.Name())
+ for _, cmd := range h.commands {
+ fmt.Fprintf(h.cdr.Output, "\t%-15s %s\n", cmd.Name(), cmd.Synopsis())
+ }
+ fmt.Fprintf(h.cdr.Output, "\nUse \"%s flags\" for a list of top-level flags\n", h.cdr.Name())
+ return subcommands.ExitSuccess
+ default:
+ // Look for commands registered to the commander and print help explanation if found.
+ found := false
+ h.cdr.VisitCommands(func(g *subcommands.CommandGroup, cmd subcommands.Command) {
+ if f.Arg(0) == cmd.Name() {
+ h.cdr.ExplainCommand(h.cdr.Output, cmd)
+ found = true
+ }
+ })
+ if found {
+ return subcommands.ExitSuccess
+ }
+
+ // Next check commands registered to the help command.
+ for _, cmd := range h.commands {
+ if f.Arg(0) == cmd.Name() {
+ fs := flag.NewFlagSet(f.Arg(0), flag.ContinueOnError)
+ fs.Usage = func() { h.cdr.ExplainCommand(h.cdr.Error, cmd) }
+ cmd.SetFlags(fs)
+ if fs.Parse(f.Args()[1:]) != nil {
+ return subcommands.ExitUsageError
+ }
+ return cmd.Execute(ctx, f, args...)
+ }
+ }
+
+ fmt.Fprintf(h.cdr.Error, "Subcommand %s not understood\n", f.Arg(0))
+ }
+
+ f.Usage()
+ return subcommands.ExitUsageError
+}
+
+// Register registers a new help command.
+func (h *Help) Register(cmd subcommands.Command) {
+ h.commands = append(h.commands, cmd)
+}
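A hedged sketch of how this Help command might be wired into a subcommands.Commander, with Syscalls attached as an extra help topic; the exact wiring in runsc's main.go may differ.

package main

import (
	"context"
	"flag"
	"os"

	"github.com/google/subcommands"
	"gvisor.googlesource.com/gvisor/runsc/cmd"
)

func main() {
	cdr := subcommands.NewCommander(flag.CommandLine, "runsc")

	// Register the help command and attach "syscalls" as an additional help
	// topic; topics registered this way appear under "Additional help topics".
	help := cmd.NewHelp(cdr)
	help.Register(new(cmd.Syscalls))
	cdr.Register(help, "")
	cdr.Register(subcommands.FlagsCommand(), "")

	flag.Parse()
	os.Exit(int(cdr.Execute(context.Background())))
}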
diff --git a/runsc/cmd/restore.go b/runsc/cmd/restore.go
index 3ab2f5676..a78a0dce6 100644
--- a/runsc/cmd/restore.go
+++ b/runsc/cmd/restore.go
@@ -80,25 +80,29 @@ func (r *Restore) Execute(_ context.Context, f *flag.FlagSet, args ...interface{
conf := args[0].(*boot.Config)
waitStatus := args[1].(*syscall.WaitStatus)
+ if conf.Rootless {
+ return Errorf("Rootless mode not supported with %q", r.Name())
+ }
+
bundleDir := r.bundleDir
if bundleDir == "" {
bundleDir = getwdOrDie()
}
spec, err := specutils.ReadSpec(bundleDir)
if err != nil {
- Fatalf("reading spec: %v", err)
+ return Errorf("reading spec: %v", err)
}
specutils.LogSpec(spec)
if r.imagePath == "" {
- Fatalf("image-path flag must be provided")
+ return Errorf("image-path flag must be provided")
}
conf.RestoreFile = filepath.Join(r.imagePath, checkpointFileName)
ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog, r.detach)
if err != nil {
- Fatalf("running container: %v", err)
+ return Errorf("running container: %v", err)
}
*waitStatus = ws
diff --git a/runsc/cmd/run.go b/runsc/cmd/run.go
index c228b4f93..abf602239 100644
--- a/runsc/cmd/run.go
+++ b/runsc/cmd/run.go
@@ -67,19 +67,23 @@ func (r *Run) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) s
conf := args[0].(*boot.Config)
waitStatus := args[1].(*syscall.WaitStatus)
+ if conf.Rootless {
+ return Errorf("Rootless mode not supported with %q", r.Name())
+ }
+
bundleDir := r.bundleDir
if bundleDir == "" {
bundleDir = getwdOrDie()
}
spec, err := specutils.ReadSpec(bundleDir)
if err != nil {
- Fatalf("reading spec: %v", err)
+ return Errorf("reading spec: %v", err)
}
specutils.LogSpec(spec)
ws, err := container.Run(id, spec, conf, bundleDir, r.consoleSocket, r.pidFile, r.userLog, r.detach)
if err != nil {
- Fatalf("running container: %v", err)
+ return Errorf("running container: %v", err)
}
*waitStatus = ws
diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go
index 657726251..31e8f42bb 100644
--- a/runsc/cmd/start.go
+++ b/runsc/cmd/start.go
@@ -16,7 +16,6 @@ package cmd
import (
"context"
-
"flag"
"github.com/google/subcommands"
"gvisor.googlesource.com/gvisor/runsc/boot"
diff --git a/runsc/cmd/syscalls.go b/runsc/cmd/syscalls.go
new file mode 100644
index 000000000..9c8a66490
--- /dev/null
+++ b/runsc/cmd/syscalls.go
@@ -0,0 +1,347 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+ "context"
+ "encoding/csv"
+ "encoding/json"
+ "fmt"
+ "io"
+ "os"
+ "sort"
+ "strconv"
+ "text/tabwriter"
+
+ "flag"
+ "github.com/google/subcommands"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
+)
+
+// Syscalls implements subcommands.Command for the "syscalls" command.
+type Syscalls struct {
+ output string
+ os string
+ arch string
+}
+
+// CompatibilityInfo is a map of system and architecture to compatibility doc.
+// Maps operating system to architecture to ArchInfo.
+type CompatibilityInfo map[string]map[string]ArchInfo
+
+// ArchInfo is the compatibility doc for an architecture.
+type ArchInfo struct {
+ // Syscalls maps syscall number for the architecture to the doc.
+ Syscalls map[uintptr]SyscallDoc `json:"syscalls"`
+}
+
+// SyscallDoc represents a single item of syscall documentation.
+type SyscallDoc struct {
+ Name string `json:"name"`
+ num uintptr
+
+ Support string `json:"support"`
+ Note string `json:"note,omitempty"`
+ URLs []string `json:"urls,omitempty"`
+}
+
+type outputFunc func(io.Writer, CompatibilityInfo) error
+
+var (
+ // The string name to use for printing compatibility for all OSes.
+ osAll = "all"
+
+ // The string name to use for printing compatibility for all architectures.
+ archAll = "all"
+
+ // A map of OS name to map of architecture name to syscall table.
+ syscallTableMap = make(map[string]map[string]*kernel.SyscallTable)
+
+ // A map of output type names to output functions.
+ outputMap = map[string]outputFunc{
+ "table": outputTable,
+ "json": outputJSON,
+ "csv": outputCSV,
+ }
+)
+
+// Name implements subcommands.Command.Name.
+func (*Syscalls) Name() string {
+ return "syscalls"
+}
+
+// Synopsis implements subcommands.Command.Synopsis.
+func (*Syscalls) Synopsis() string {
+ return "Print compatibility information for syscalls."
+}
+
+// Usage implements subcommands.Command.Usage.
+func (*Syscalls) Usage() string {
+ return `syscalls [options] - Print compatibility information for syscalls.
+`
+}
+
+// SetFlags implements subcommands.Command.SetFlags.
+func (s *Syscalls) SetFlags(f *flag.FlagSet) {
+ f.StringVar(&s.output, "o", "table", "Output format (table, csv, json).")
+ f.StringVar(&s.os, "os", osAll, "The OS (e.g. linux)")
+ f.StringVar(&s.arch, "arch", archAll, "The CPU architecture (e.g. amd64).")
+}
+
+// Execute implements subcommands.Command.Execute.
+func (s *Syscalls) Execute(_ context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
+ out, ok := outputMap[s.output]
+ if !ok {
+ Fatalf("Unsupported output format %q", s.output)
+ }
+
+ // Build map of all supported architectures.
+ tables := kernel.SyscallTables()
+ for _, t := range tables {
+ osMap, ok := syscallTableMap[t.OS.String()]
+ if !ok {
+ osMap = make(map[string]*kernel.SyscallTable)
+ syscallTableMap[t.OS.String()] = osMap
+ }
+ osMap[t.Arch.String()] = t
+ }
+
+ // Build a map of the architectures we want to output.
+ info, err := getCompatibilityInfo(s.os, s.arch)
+ if err != nil {
+ Fatalf("%v", err)
+ }
+
+ if err := out(os.Stdout, info); err != nil {
+ Fatalf("Error writing output: %v", err)
+ }
+
+ return subcommands.ExitSuccess
+}
+
+// getCompatibilityInfo returns compatibility info for the given OS name and
+// architecture name. Supports the special name 'all' for OS and architecture that
+// specifies that all supported OSes or architectures should be included.
+func getCompatibilityInfo(osName string, archName string) (CompatibilityInfo, error) {
+ info := CompatibilityInfo(make(map[string]map[string]ArchInfo))
+ if osName == osAll {
+ // Special processing for the 'all' OS name.
+ for osName := range syscallTableMap {
+ info[osName] = make(map[string]ArchInfo)
+ // osName is a specific OS name.
+ if err := addToCompatibilityInfo(info, osName, archName); err != nil {
+ return info, err
+ }
+ }
+ } else {
+ // osName is a specific OS name.
+ info[osName] = make(map[string]ArchInfo)
+ if err := addToCompatibilityInfo(info, osName, archName); err != nil {
+ return info, err
+ }
+ }
+
+ return info, nil
+}
+
+// addToCompatibilityInfo adds ArchInfo for the given specific OS name and
+// architecture name. Supports the special architecture name 'all' to specify
+// that all supported architectures for the OS should be included.
+func addToCompatibilityInfo(info CompatibilityInfo, osName string, archName string) error {
+ if archName == archAll {
+ // Special processing for the 'all' architecture name.
+ for archName := range syscallTableMap[osName] {
+ archInfo, err := getArchInfo(osName, archName)
+ if err != nil {
+ return err
+ }
+ info[osName][archName] = archInfo
+ }
+ } else {
+ // archName is a specific architecture name.
+ archInfo, err := getArchInfo(osName, archName)
+ if err != nil {
+ return err
+ }
+ info[osName][archName] = archInfo
+ }
+
+ return nil
+}
+
+// getArchInfo returns compatibility info for a specific OS and architecture.
+func getArchInfo(osName string, archName string) (ArchInfo, error) {
+ info := ArchInfo{}
+ info.Syscalls = make(map[uintptr]SyscallDoc)
+
+ t, ok := syscallTableMap[osName][archName]
+ if !ok {
+ return info, fmt.Errorf("syscall table for %s/%s not found", osName, archName)
+ }
+
+ for num, sc := range t.Table {
+ info.Syscalls[num] = SyscallDoc{
+ Name: sc.Name,
+ num: num,
+ Support: sc.SupportLevel.String(),
+ Note: sc.Note,
+ URLs: sc.URLs,
+ }
+ }
+
+ return info, nil
+}
+
+// outputTable outputs the syscall info in tabular format.
+func outputTable(w io.Writer, info CompatibilityInfo) error {
+ tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0)
+
+ // Linux
+ for osName, osInfo := range info {
+ for archName, archInfo := range osInfo {
+ // Print the OS/arch
+ fmt.Fprintf(w, "%s/%s:\n\n", osName, archName)
+
+ // Sort the syscalls for output in the table.
+ sortedCalls := []SyscallDoc{}
+ for _, sc := range archInfo.Syscalls {
+ sortedCalls = append(sortedCalls, sc)
+ }
+ sort.Slice(sortedCalls, func(i, j int) bool {
+ return sortedCalls[i].num < sortedCalls[j].num
+ })
+
+ // Write the header
+ _, err := fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n",
+ "NUM",
+ "NAME",
+ "SUPPORT",
+ "NOTE",
+ )
+ if err != nil {
+ return err
+ }
+
+ // Write each syscall entry
+ for _, sc := range sortedCalls {
+ _, err = fmt.Fprintf(tw, "%s\t%s\t%s\t%s\n",
+ strconv.FormatInt(int64(sc.num), 10),
+ sc.Name,
+ sc.Support,
+ sc.Note,
+ )
+ if err != nil {
+ return err
+ }
+ // Add issue urls to note.
+ for _, url := range sc.URLs {
+ _, err = fmt.Fprintf(tw, "%s\t%s\t%s\tSee: %s\t\n",
+ "",
+ "",
+ "",
+ url,
+ )
+ if err != nil {
+ return err
+ }
+ }
+ }
+
+ err = tw.Flush()
+ if err != nil {
+ return err
+ }
+ }
+ }
+
+ return nil
+}
+
+// outputJSON outputs the syscall info in JSON format.
+func outputJSON(w io.Writer, info CompatibilityInfo) error {
+ e := json.NewEncoder(w)
+ e.SetIndent("", " ")
+ return e.Encode(info)
+}
+
+// numberedRow is a CSV row annotated with its syscall number (used for sorting).
+type numberedRow struct {
+ num uintptr
+ row []string
+}
+
+// outputCSV outputs the syscall info in CSV format.
+func outputCSV(w io.Writer, info CompatibilityInfo) error {
+ csvWriter := csv.NewWriter(w)
+
+ // Linux
+ for osName, osInfo := range info {
+ for archName, archInfo := range osInfo {
+ // Sort the syscalls for output in the table.
+ sortedCalls := []numberedRow{}
+ for _, sc := range archInfo.Syscalls {
+ // Add issue urls to note.
+ note := sc.Note
+ for _, url := range sc.URLs {
+ note = fmt.Sprintf("%s\nSee: %s", note, url)
+ }
+
+ sortedCalls = append(sortedCalls, numberedRow{
+ num: sc.num,
+ row: []string{
+ osName,
+ archName,
+ strconv.FormatInt(int64(sc.num), 10),
+ sc.Name,
+ sc.Support,
+ note,
+ },
+ })
+ }
+ sort.Slice(sortedCalls, func(i, j int) bool {
+ return sortedCalls[i].num < sortedCalls[j].num
+ })
+
+ // Write the header
+ err := csvWriter.Write([]string{
+ "OS",
+ "Arch",
+ "Num",
+ "Name",
+ "Support",
+ "Note",
+ })
+ if err != nil {
+ return err
+ }
+
+ // Write each syscall entry
+ for _, sc := range sortedCalls {
+ err = csvWriter.Write(sc.row)
+ if err != nil {
+ return err
+ }
+ }
+
+ csvWriter.Flush()
+ err = csvWriter.Error()
+ if err != nil {
+ return err
+ }
+ }
+ }
+
+ return nil
+}
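To show the JSON shape this data takes, here is a hedged sketch that hand-builds a one-entry CompatibilityInfo and marshals it with indentation, the same way outputJSON does; the syscall name and support string are illustrative only.

package main

import (
	"encoding/json"
	"fmt"

	"gvisor.googlesource.com/gvisor/runsc/cmd"
)

func main() {
	// One OS, one architecture, one syscall; only exported SyscallDoc fields
	// are set, matching the json tags defined above.
	info := cmd.CompatibilityInfo{
		"linux": {
			"amd64": cmd.ArchInfo{
				Syscalls: map[uintptr]cmd.SyscallDoc{
					0: {Name: "read", Support: "Full Support"},
				},
			},
		},
	}
	b, err := json.MarshalIndent(info, "", "  ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b))
}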
diff --git a/runsc/cmd/wait.go b/runsc/cmd/wait.go
index a55a682f3..58fd01974 100644
--- a/runsc/cmd/wait.go
+++ b/runsc/cmd/wait.go
@@ -88,14 +88,14 @@ func (wt *Wait) Execute(_ context.Context, f *flag.FlagSet, args ...interface{})
waitStatus = ws
// Wait on a PID in the root PID namespace.
case wt.rootPID != unsetPID:
- ws, err := c.WaitRootPID(int32(wt.rootPID), true /* clearStatus */)
+ ws, err := c.WaitRootPID(int32(wt.rootPID))
if err != nil {
Fatalf("waiting on PID in root PID namespace %d in container %q: %v", wt.rootPID, c.ID, err)
}
waitStatus = ws
// Wait on a PID in the container's PID namespace.
case wt.pid != unsetPID:
- ws, err := c.WaitPID(int32(wt.pid), true /* clearStatus */)
+ ws, err := c.WaitPID(int32(wt.pid))
if err != nil {
Fatalf("waiting on PID %d in container %q: %v", wt.pid, c.ID, err)
}
diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go
index b8af27c15..d016533e6 100644
--- a/runsc/container/console_test.go
+++ b/runsc/container/console_test.go
@@ -258,7 +258,7 @@ func TestJobControlSignalExec(t *testing.T) {
}
// Make sure the process indicates it was killed by a SIGKILL.
- ws, err := c.WaitPID(pid, true)
+ ws, err := c.WaitPID(pid)
if err != nil {
t.Errorf("waiting on container failed: %v", err)
}
diff --git a/runsc/container/container.go b/runsc/container/container.go
index 513085836..04b611b56 100644
--- a/runsc/container/container.go
+++ b/runsc/container/container.go
@@ -530,22 +530,22 @@ func (c *Container) Wait() (syscall.WaitStatus, error) {
// WaitRootPID waits for process 'pid' in the sandbox's PID namespace and
// returns its WaitStatus.
-func (c *Container) WaitRootPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+func (c *Container) WaitRootPID(pid int32) (syscall.WaitStatus, error) {
log.Debugf("Wait on PID %d in sandbox %q", pid, c.Sandbox.ID)
if !c.isSandboxRunning() {
return 0, fmt.Errorf("sandbox is not running")
}
- return c.Sandbox.WaitPID(c.Sandbox.ID, pid, clearStatus)
+ return c.Sandbox.WaitPID(c.Sandbox.ID, pid)
}
// WaitPID waits for process 'pid' in the container's PID namespace and returns
// its WaitStatus.
-func (c *Container) WaitPID(pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+func (c *Container) WaitPID(pid int32) (syscall.WaitStatus, error) {
log.Debugf("Wait on PID %d in container %q", pid, c.ID)
if !c.isSandboxRunning() {
return 0, fmt.Errorf("sandbox is not running")
}
- return c.Sandbox.WaitPID(c.ID, pid, clearStatus)
+ return c.Sandbox.WaitPID(c.ID, pid)
}
// SignalContainer sends the signal to the container. If all is true and signal
diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go
index dcd9910a0..867bf8187 100644
--- a/runsc/container/container_test.go
+++ b/runsc/container/container_test.go
@@ -36,6 +36,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/sentry/control"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/runsc/boot"
+ "gvisor.googlesource.com/gvisor/runsc/specutils"
"gvisor.googlesource.com/gvisor/runsc/test/testutil"
)
@@ -1841,7 +1842,7 @@ func (cont *Container) executeSync(args *control.ExecArgs) (syscall.WaitStatus,
if err != nil {
return 0, fmt.Errorf("error executing: %v", err)
}
- ws, err := cont.WaitPID(pid, true /* clearStatus */)
+ ws, err := cont.WaitPID(pid)
if err != nil {
return 0, fmt.Errorf("error waiting: %v", err)
}
@@ -1853,7 +1854,7 @@ func TestMain(m *testing.M) {
if err := testutil.ConfigureExePath(); err != nil {
panic(err.Error())
}
- testutil.RunAsRoot()
+ specutils.MaybeRunAsRoot()
os.Exit(m.Run())
}
diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go
index 39c4dc03d..d57a73d46 100644
--- a/runsc/container/multi_container_test.go
+++ b/runsc/container/multi_container_test.go
@@ -99,6 +99,36 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C
return containers, cleanup, nil
}
+type execDesc struct {
+ c *Container
+ cmd []string
+ want int
+ desc string
+}
+
+func execMany(execs []execDesc) error {
+ for _, exec := range execs {
+ args := &control.ExecArgs{Argv: exec.cmd}
+ if ws, err := exec.c.executeSync(args); err != nil {
+ return fmt.Errorf("error executing %+v: %v", args, err)
+ } else if ws.ExitStatus() != exec.want {
+ return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", exec.desc, exec.cmd, ws.ExitStatus(), exec.want)
+ }
+ }
+ return nil
+}
+
+func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
+ for _, spec := range pod {
+ spec.Annotations[path.Join(boot.MountPrefix, name, "source")] = mount.Source
+ spec.Annotations[path.Join(boot.MountPrefix, name, "type")] = mount.Type
+ spec.Annotations[path.Join(boot.MountPrefix, name, "share")] = "pod"
+ if len(mount.Options) > 0 {
+ spec.Annotations[path.Join(boot.MountPrefix, name, "options")] = strings.Join(mount.Options, ",")
+ }
+ }
+}
+
// TestMultiContainerSanity checks that it is possible to run 2 dead-simple
// containers in the same sandbox.
func TestMultiContainerSanity(t *testing.T) {
@@ -175,12 +205,12 @@ func TestMultiContainerWait(t *testing.T) {
go func(c *Container) {
defer wg.Done()
const pid = 2
- if ws, err := c.WaitPID(pid, true /* clearStatus */); err != nil {
+ if ws, err := c.WaitPID(pid); err != nil {
t.Errorf("failed to wait for PID %d: %v", pid, err)
} else if es := ws.ExitStatus(); es != 0 {
t.Errorf("PID %d exited with non-zero status %d", pid, es)
}
- if _, err := c.WaitPID(pid, true /* clearStatus */); err == nil {
+ if _, err := c.WaitPID(pid); err == nil {
t.Errorf("wait for stopped PID %d should fail", pid)
}
}(containers[1])
@@ -263,12 +293,12 @@ func TestExecWait(t *testing.T) {
}
// Get the exit status from the exec'd process.
- if ws, err := containers[0].WaitPID(pid, true /* clearStatus */); err != nil {
+ if ws, err := containers[0].WaitPID(pid); err != nil {
t.Fatalf("failed to wait for process %+v with pid %d: %v", args, pid, err)
} else if es := ws.ExitStatus(); es != 0 {
t.Fatalf("process %+v exited with non-zero status %d", args, es)
}
- if _, err := containers[0].WaitPID(pid, true /* clearStatus */); err == nil {
+ if _, err := containers[0].WaitPID(pid); err == nil {
t.Fatalf("wait for stopped process %+v should fail", args)
}
}
@@ -828,3 +858,272 @@ func TestMultiContainerGoferStop(t *testing.T) {
}
}
}
+
+// Test that pod shared mounts are properly mounted in 2 containers and that
+// changes from one container is reflected in the other.
+func TestMultiContainerSharedMount(t *testing.T) {
+ for _, conf := range configs(all...) {
+ t.Logf("Running test with conf: %+v", conf)
+
+ // Setup the containers.
+ sleep := []string{"sleep", "100"}
+ podSpec, ids := createSpecs(sleep, sleep)
+ mnt0 := specs.Mount{
+ Destination: "/mydir/test",
+ Source: "/some/dir",
+ Type: "tmpfs",
+ Options: nil,
+ }
+ podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+ mnt1 := mnt0
+ mnt1.Destination = "/mydir2/test2"
+ podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+ createSharedMount(mnt0, "test-mount", podSpec...)
+
+ containers, cleanup, err := startContainers(conf, podSpec, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ file0 := path.Join(mnt0.Destination, "abc")
+ file1 := path.Join(mnt1.Destination, "abc")
+ execs := []execDesc{
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
+ desc: "directory is mounted in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
+ desc: "directory is mounted in container1",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/touch", file0},
+ desc: "create file in container0",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-f", file0},
+ desc: "file appears in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-f", file1},
+ desc: "file appears in container1",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/bin/rm", file1},
+ desc: "file removed from container1",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "!", "-f", file0},
+ desc: "file removed from container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "!", "-f", file1},
+ desc: "file removed from container1",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/bin/mkdir", file1},
+ desc: "create directory in container1",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-d", file0},
+ desc: "dir appears in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-d", file1},
+ desc: "dir appears in container1",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/bin/rmdir", file0},
+ desc: "create directory in container0",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "!", "-d", file0},
+ desc: "dir removed from container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "!", "-d", file1},
+ desc: "dir removed from container1",
+ },
+ }
+ if err := execMany(execs); err != nil {
+ t.Fatal(err.Error())
+ }
+ }
+}
+
+// Test that pod mounts are mounted as readonly when requested.
+func TestMultiContainerSharedMountReadonly(t *testing.T) {
+ for _, conf := range configs(all...) {
+ t.Logf("Running test with conf: %+v", conf)
+
+ // Setup the containers.
+ sleep := []string{"sleep", "100"}
+ podSpec, ids := createSpecs(sleep, sleep)
+ mnt0 := specs.Mount{
+ Destination: "/mydir/test",
+ Source: "/some/dir",
+ Type: "tmpfs",
+ Options: []string{"ro"},
+ }
+ podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+ mnt1 := mnt0
+ mnt1.Destination = "/mydir2/test2"
+ podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+ createSharedMount(mnt0, "test-mount", podSpec...)
+
+ containers, cleanup, err := startContainers(conf, podSpec, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ file0 := path.Join(mnt0.Destination, "abc")
+ file1 := path.Join(mnt1.Destination, "abc")
+ execs := []execDesc{
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-d", mnt0.Destination},
+ desc: "directory is mounted in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-d", mnt1.Destination},
+ desc: "directory is mounted in container1",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/touch", file0},
+ want: 1,
+ desc: "fails to write to container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/touch", file1},
+ want: 1,
+ desc: "fails to write to container1",
+ },
+ }
+ if err := execMany(execs); err != nil {
+ t.Fatal(err.Error())
+ }
+ }
+}
+
+// Test that shared pod mounts continue to work after container is restarted.
+func TestMultiContainerSharedMountRestart(t *testing.T) {
+ for _, conf := range configs(all...) {
+ t.Logf("Running test with conf: %+v", conf)
+
+ // Setup the containers.
+ sleep := []string{"sleep", "100"}
+ podSpec, ids := createSpecs(sleep, sleep)
+ mnt0 := specs.Mount{
+ Destination: "/mydir/test",
+ Source: "/some/dir",
+ Type: "tmpfs",
+ Options: nil,
+ }
+ podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0)
+
+ mnt1 := mnt0
+ mnt1.Destination = "/mydir2/test2"
+ podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1)
+
+ createSharedMount(mnt0, "test-mount", podSpec...)
+
+ containers, cleanup, err := startContainers(conf, podSpec, ids)
+ if err != nil {
+ t.Fatalf("error starting containers: %v", err)
+ }
+ defer cleanup()
+
+ file0 := path.Join(mnt0.Destination, "abc")
+ file1 := path.Join(mnt1.Destination, "abc")
+ execs := []execDesc{
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/touch", file0},
+ desc: "create file in container0",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-f", file0},
+ desc: "file appears in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-f", file1},
+ desc: "file appears in container1",
+ },
+ }
+ if err := execMany(execs); err != nil {
+ t.Fatal(err.Error())
+ }
+
+ containers[1].Destroy()
+
+ bundleDir, err := testutil.SetupBundleDir(podSpec[1])
+ if err != nil {
+ t.Fatalf("error restarting container: %v", err)
+ }
+ defer os.RemoveAll(bundleDir)
+
+ containers[1], err = Create(ids[1], podSpec[1], conf, bundleDir, "", "", "")
+ if err != nil {
+ t.Fatalf("error creating container: %v", err)
+ }
+ if err := containers[1].Start(conf); err != nil {
+ t.Fatalf("error starting container: %v", err)
+ }
+
+ execs = []execDesc{
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "-f", file0},
+ desc: "file is still in container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "-f", file1},
+ desc: "file is still in container1",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/bin/rm", file1},
+ desc: "file removed from container1",
+ },
+ {
+ c: containers[0],
+ cmd: []string{"/usr/bin/test", "!", "-f", file0},
+ desc: "file removed from container0",
+ },
+ {
+ c: containers[1],
+ cmd: []string{"/usr/bin/test", "!", "-f", file1},
+ desc: "file removed from container1",
+ },
+ }
+ if err := execMany(execs); err != nil {
+ t.Fatal(err.Error())
+ }
+ }
+}
diff --git a/runsc/main.go b/runsc/main.go
index 11bc73f75..cfe3a78d0 100644
--- a/runsc/main.go
+++ b/runsc/main.go
@@ -48,11 +48,12 @@ var (
// system that are not covered by the runtime spec.
// Debugging flags.
- debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
- logPackets = flag.Bool("log-packets", false, "enable network packet logging")
- logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.")
- debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.")
- debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s")
+ debugLog = flag.String("debug-log", "", "additional location for logs. If it ends with '/', log files are created inside the directory with default names. The following variables are available: %TIMESTAMP%, %COMMAND%.")
+ logPackets = flag.Bool("log-packets", false, "enable network packet logging")
+ logFD = flag.Int("log-fd", -1, "file descriptor to log to. If set, the 'log' flag is ignored.")
+ debugLogFD = flag.Int("debug-log-fd", -1, "file descriptor to write debug logs to. If set, the 'debug-log-dir' flag is ignored.")
+ debugLogFormat = flag.String("debug-log-format", "text", "log format: text (default), json, or json-k8s")
+ alsoLogToStderr = flag.Bool("alsologtostderr", false, "send log messages to stderr")
// Debugging flags: strace related
strace = flag.Bool("strace", false, "enable strace")
@@ -60,22 +61,27 @@ var (
straceLogSize = flag.Uint("strace-log-size", 1024, "default size (in bytes) to log data argument blobs")
// Flags that control sandbox runtime behavior.
- platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
- network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
- gso = flag.Bool("gso", true, "enable generic segmenation offload")
- fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
- overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
- watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
- panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
- profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
- netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
-
+ platform = flag.String("platform", "ptrace", "specifies which platform to use: ptrace (default), kvm")
+ network = flag.String("network", "sandbox", "specifies which network to use: sandbox (default), host, none. Using network inside the sandbox is more secure because it's isolated from the host network.")
+ gso = flag.Bool("gso", true, "enable generic segmentation offload")
+ fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.")
+ overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.")
+ watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.")
+ panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.")
+ profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).")
+ netRaw = flag.Bool("net-raw", false, "enable raw sockets. When false, raw sockets are disabled by removing CAP_NET_RAW from containers (`runsc exec` will still be able to utilize raw sockets). Raw sockets allow malicious containers to craft packets and potentially attack the network.")
+ numNetworkChannels = flag.Int("num-network-channels", 1, "number of underlying channels (FDs) to use for network link endpoints.")
+ rootless = flag.Bool("rootless", false, "allow the sandbox to be started by a user that is not root. Sandbox and Gofer processes may run with the same privileges as the current user.")
+
+ // Test flags, not to be used outside tests, ever.
testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.")
)
func main() {
// Help and flags commands are generated automatically.
- subcommands.Register(subcommands.HelpCommand(), "")
+ help := cmd.NewHelp(subcommands.DefaultCommander)
+ help.Register(new(cmd.Syscalls))
+ subcommands.Register(help, "")
subcommands.Register(subcommands.FlagsCommand(), "")
// Register user-facing runsc commands.
@@ -117,6 +123,22 @@ func main() {
os.Exit(0)
}
+ var errorLogger io.Writer
+ if *logFD > -1 {
+ errorLogger = os.NewFile(uintptr(*logFD), "error log file")
+
+ } else if *logFilename != "" {
+ // We must set O_APPEND and not O_TRUNC because Docker passes
+ // the same log file for all commands (and also parses these
+ // log files), so we can't destroy them on each command.
+ var err error
+ errorLogger, err = os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
+ if err != nil {
+ cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
+ }
+ }
+ cmd.ErrorLogger = errorLogger
+
platformType, err := boot.MakePlatformType(*platform)
if err != nil {
cmd.Fatalf("%v", err)
@@ -141,26 +163,33 @@ func main() {
cmd.Fatalf("%v", err)
}
+ if *numNetworkChannels <= 0 {
+ cmd.Fatalf("num_network_channels must be > 0, got: %d", *numNetworkChannels)
+ }
+
// Create a new Config from the flags.
conf := &boot.Config{
- RootDir: *rootDir,
- Debug: *debug,
- LogFilename: *logFilename,
- LogFormat: *logFormat,
- DebugLog: *debugLog,
- DebugLogFormat: *debugLogFormat,
- FileAccess: fsAccess,
- Overlay: *overlay,
- Network: netType,
- GSO: *gso,
- LogPackets: *logPackets,
- Platform: platformType,
- Strace: *strace,
- StraceLogSize: *straceLogSize,
- WatchdogAction: wa,
- PanicSignal: *panicSignal,
- ProfileEnable: *profile,
- EnableRaw: *netRaw,
+ RootDir: *rootDir,
+ Debug: *debug,
+ LogFilename: *logFilename,
+ LogFormat: *logFormat,
+ DebugLog: *debugLog,
+ DebugLogFormat: *debugLogFormat,
+ FileAccess: fsAccess,
+ Overlay: *overlay,
+ Network: netType,
+ GSO: *gso,
+ LogPackets: *logPackets,
+ Platform: platformType,
+ Strace: *strace,
+ StraceLogSize: *straceLogSize,
+ WatchdogAction: wa,
+ PanicSignal: *panicSignal,
+ ProfileEnable: *profile,
+ EnableRaw: *netRaw,
+ NumNetworkChannels: *numNetworkChannels,
+ Rootless: *rootless,
+
TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot,
}
if len(*straceSyscalls) != 0 {
@@ -174,24 +203,7 @@ func main() {
subcommand := flag.CommandLine.Arg(0)
- var logFile io.Writer = os.Stderr
- if *logFD > -1 {
- logFile = os.NewFile(uintptr(*logFD), "log file")
- } else if *logFilename != "" {
- // We must set O_APPEND and not O_TRUNC because Docker passes
- // the same log file for all commands (and also parses these
- // log files), so we can't destroy them on each command.
- f, err := os.OpenFile(*logFilename, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
- if err != nil {
- cmd.Fatalf("error opening log file %q: %v", *logFilename, err)
- }
- logFile = f
- } else if subcommand == "do" {
- logFile = ioutil.Discard
- }
-
- e := newEmitter(*logFormat, logFile)
-
+ var e log.Emitter
if *debugLogFD > -1 {
f := os.NewFile(uintptr(*debugLogFD), "debug log file")
@@ -201,28 +213,31 @@ func main() {
cmd.Fatalf("flag --debug-log-fd should only be passed to 'boot' and 'gofer' command, but was passed to %q", subcommand)
}
- // If we are the boot process, then we own our stdio FDs and
- // can do what we want with them. Since Docker and Containerd
- // both eat boot's stderr, we dup our stderr to the provided
- // log FD so that panics will appear in the logs, rather than
- // just disappear.
+ // If we are the boot process, then we own our stdio FDs and can do what we
+ // want with them. Since Docker and Containerd both eat boot's stderr, we
+ // dup our stderr to the provided log FD so that panics will appear in the
+ // logs, rather than just disappear.
if err := syscall.Dup2(int(f.Fd()), int(os.Stderr.Fd())); err != nil {
cmd.Fatalf("error dup'ing fd %d to stderr: %v", f.Fd(), err)
}
- if logFile == os.Stderr {
- // Suppress logging to stderr when debug log is enabled. Otherwise all
- // messages will be duplicated in the debug log (see Dup2() call above).
- e = newEmitter(*debugLogFormat, f)
- } else {
- e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)}
- }
+ e = newEmitter(*debugLogFormat, f)
+
} else if *debugLog != "" {
f, err := specutils.DebugLogFile(*debugLog, subcommand)
if err != nil {
cmd.Fatalf("error opening debug log file in %q: %v", *debugLog, err)
}
- e = log.MultiEmitter{e, newEmitter(*debugLogFormat, f)}
+ e = newEmitter(*debugLogFormat, f)
+
+ } else {
+ // Stderr is reserved for the application; just discard the logs if no debug
+ // log is specified.
+ e = newEmitter("text", ioutil.Discard)
+ }
+
+ if *alsoLogToStderr {
+ e = log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}
}
log.SetTarget(e)
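
The emitter selection above ends with an optional fan-out: when --alsologtostderr is set, log.MultiEmitter wraps the chosen emitter so every record also reaches stderr. As a rough standard-library analogy (a sketch, not the gVisor log package itself), the same fan-out shape looks like this:

package main

import (
	"io"
	"log"
	"os"
)

func main() {
	// Open the debug log with O_APPEND, mirroring the reasoning above: the
	// same file may be reused across commands, so it must not be truncated.
	f, err := os.OpenFile("/tmp/debug.log", os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		log.Fatalf("error opening log file: %v", err)
	}
	defer f.Close()

	// Fan out every record to both the file and stderr, analogous to
	// log.MultiEmitter{e, newEmitter(*debugLogFormat, os.Stderr)}.
	logger := log.New(io.MultiWriter(f, os.Stderr), "", log.LstdFlags)
	logger.Println("written to the debug log and to stderr")
}
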
diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go
index 0460d5f1a..e9e24fc58 100644
--- a/runsc/sandbox/network.go
+++ b/runsc/sandbox/network.go
@@ -68,7 +68,7 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi
// Build the path to the net namespace of the sandbox process.
// This is what we will copy.
nsPath := filepath.Join("/proc", strconv.Itoa(pid), "ns/net")
- if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO); err != nil {
+ if err := createInterfacesAndRoutesFromNS(conn, nsPath, conf.GSO, conf.NumNetworkChannels); err != nil {
return fmt.Errorf("creating interfaces from net namespace %q: %v", nsPath, err)
}
case boot.NetworkHost:
@@ -138,7 +138,7 @@ func isRootNS() (bool, error) {
// createInterfacesAndRoutesFromNS scrapes the interface and routes from the
// net namespace with the given path, creates them in the sandbox, and removes
// them from the host.
-func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool) error {
+func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO bool, numNetworkChannels int) error {
// Join the network namespace that we will be copying.
restore, err := joinNetNS(nsPath)
if err != nil {
@@ -202,25 +202,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
continue
}
- // Create the socket.
- const protocol = 0x0300 // htons(ETH_P_ALL)
- fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
- if err != nil {
- return fmt.Errorf("unable to create raw socket: %v", err)
- }
- deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
-
- // Bind to the appropriate device.
- ll := syscall.SockaddrLinklayer{
- Protocol: protocol,
- Ifindex: iface.Index,
- Hatype: 0, // No ARP type.
- Pkttype: syscall.PACKET_OTHERHOST,
- }
- if err := syscall.Bind(fd, &ll); err != nil {
- return fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
- }
-
// Scrape the routes before removing the address, since that
// will remove the routes as well.
routes, def, err := routesForIface(iface)
@@ -236,9 +217,10 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
}
link := boot.FDBasedLink{
- Name: iface.Name,
- MTU: iface.MTU,
- Routes: routes,
+ Name: iface.Name,
+ MTU: iface.MTU,
+ Routes: routes,
+ NumChannels: numNetworkChannels,
}
// Get the link for the interface.
@@ -246,32 +228,25 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
if err != nil {
return fmt.Errorf("getting link for interface %q: %v", iface.Name, err)
}
- link.LinkAddress = []byte(ifaceLink.Attrs().HardwareAddr)
+ link.LinkAddress = ifaceLink.Attrs().HardwareAddr
- if enableGSO {
- gso, err := isGSOEnabled(fd, iface.Name)
+ log.Debugf("Setting up network channels")
+ // Create the socket for the device.
+ for i := 0; i < link.NumChannels; i++ {
+ log.Debugf("Creating Channel %d", i)
+ socketEntry, err := createSocket(iface, ifaceLink, enableGSO)
if err != nil {
- return fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
+ return fmt.Errorf("failed to createSocket for %s : %v", iface.Name, err)
}
- if gso {
- if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
- return fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
- }
- link.GSOMaxSize = ifaceLink.Attrs().GSOMaxSize
+ if i == 0 {
+ link.GSOMaxSize = socketEntry.gsoMaxSize
} else {
- log.Infof("GSO not available in host.")
+ if link.GSOMaxSize != socketEntry.gsoMaxSize {
+ return fmt.Errorf("inconsistent gsoMaxSize %d and %d when creating multiple channels for same interface: %s",
+ link.GSOMaxSize, socketEntry.gsoMaxSize, iface.Name)
+ }
}
- }
-
- // Use SO_RCVBUFFORCE because on linux the receive buffer for an
- // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
- // defaults to a unusually low value of 208KB. This is too low
- // for gVisor to be able to receive packets at high throughputs
- // without incurring packet drops.
- const rcvBufSize = 4 << 20 // 4MB.
-
- if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
- return fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+ args.FilePayload.Files = append(args.FilePayload.Files, socketEntry.deviceFile)
}
// Collect the addresses for the interface, enable forwarding,
@@ -285,7 +260,6 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
}
}
- args.FilePayload.Files = append(args.FilePayload.Files, deviceFile)
args.FDBasedLinks = append(args.FDBasedLinks, link)
}
@@ -296,6 +270,61 @@ func createInterfacesAndRoutesFromNS(conn *urpc.Client, nsPath string, enableGSO
return nil
}
+type socketEntry struct {
+ deviceFile *os.File
+ gsoMaxSize uint32
+}
+
+// createSocket creates an underlying AF_PACKET socket, configures it for use by
+// the sentry, and returns an *os.File that wraps the underlying socket FD.
+func createSocket(iface net.Interface, ifaceLink netlink.Link, enableGSO bool) (*socketEntry, error) {
+ // Create the socket.
+ const protocol = 0x0300 // htons(ETH_P_ALL)
+ fd, err := syscall.Socket(syscall.AF_PACKET, syscall.SOCK_RAW, protocol)
+ if err != nil {
+ return nil, fmt.Errorf("unable to create raw socket: %v", err)
+ }
+ deviceFile := os.NewFile(uintptr(fd), "raw-device-fd")
+ // Bind to the appropriate device.
+ ll := syscall.SockaddrLinklayer{
+ Protocol: protocol,
+ Ifindex: iface.Index,
+ Hatype: 0, // No ARP type.
+ Pkttype: syscall.PACKET_OTHERHOST,
+ }
+ if err := syscall.Bind(fd, &ll); err != nil {
+ return nil, fmt.Errorf("unable to bind to %q: %v", iface.Name, err)
+ }
+
+ gsoMaxSize := uint32(0)
+ if enableGSO {
+ gso, err := isGSOEnabled(fd, iface.Name)
+ if err != nil {
+ return nil, fmt.Errorf("getting GSO for interface %q: %v", iface.Name, err)
+ }
+ if gso {
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_VNET_HDR, 1); err != nil {
+ return nil, fmt.Errorf("unable to enable the PACKET_VNET_HDR option: %v", err)
+ }
+ gsoMaxSize = ifaceLink.Attrs().GSOMaxSize
+ } else {
+ log.Infof("GSO not available in host.")
+ }
+ }
+
+ // Use SO_RCVBUFFORCE because on linux the receive buffer for an
+ // AF_PACKET socket is capped by "net.core.rmem_max". rmem_max
+ // defaults to an unusually low value of 208KB. This is too low
+ // for gVisor to be able to receive packets at high throughputs
+ // without incurring packet drops.
+ const rcvBufSize = 4 << 20 // 4MB.
+
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_RCVBUFFORCE, rcvBufSize); err != nil {
+ return nil, fmt.Errorf("failed to increase socket rcv buffer to %d: %v", rcvBufSize, err)
+ }
+ return &socketEntry{deviceFile, gsoMaxSize}, nil
+}
+
// loopbackLinks collects the links for a loopback interface.
func loopbackLinks(iface net.Interface, addrs []net.Addr) ([]boot.LoopbackLink, error) {
var links []boot.LoopbackLink
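
The protocol constant 0x0300 used in createSocket above is the byte-swapped ETH_P_ALL (0x0003); the "htons(ETH_P_ALL)" comment is terse, so here is a minimal sketch (assuming a little-endian host, which is what the sandbox targets) showing where the value comes from:

package main

import "fmt"

// htons swaps the two bytes of a 16-bit value; on little-endian hosts this is
// the host-to-network byte order conversion.
func htons(v uint16) uint16 {
	return v<<8 | v>>8
}

func main() {
	const ethPAll = 0x0003 // ETH_P_ALL from <linux/if_ether.h>
	fmt.Printf("%#04x\n", htons(ethPAll)) // prints 0x0300, the constant used above
}
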
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index 47a66afb2..5ff6f879c 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -515,46 +515,64 @@ func (s *Sandbox) createSandboxProcess(spec *specs.Spec, conf *boot.Config, bund
} else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
log.Infof("Sandbox will be started in new user namespace")
nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
+ cmd.Args = append(cmd.Args, "--setup-root")
- // Map nobody in the new namespace to nobody in the parent namespace.
- //
- // A sandbox process will construct an empty
- // root for itself, so it has to have the CAP_SYS_ADMIN
- // capability.
- //
- // FIXME(b/122554829): The current implementations of
- // os/exec doesn't allow to set ambient capabilities if
- // a process is started in a new user namespace. As a
- // workaround, we start the sandbox process with the 0
- // UID and then it constructs a chroot and sets UID to
- // nobody. https://github.com/golang/go/issues/2315
- const nobody = 65534
- cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
- {
- ContainerID: int(0),
- HostID: int(nobody - 1),
- Size: int(1),
- },
- {
- ContainerID: int(nobody),
- HostID: int(nobody),
- Size: int(1),
- },
- }
- cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
- {
- ContainerID: int(nobody),
- HostID: int(nobody),
- Size: int(1),
- },
+ if conf.Rootless {
+ log.Infof("Rootless mode: sandbox will run as root inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
+ cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
+ {
+ ContainerID: 0,
+ HostID: os.Getuid(),
+ Size: 1,
+ },
+ }
+ cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
+ {
+ ContainerID: 0,
+ HostID: os.Getgid(),
+ Size: 1,
+ },
+ }
+ cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
+
+ } else {
+ // Map nobody in the new namespace to nobody in the parent namespace.
+ //
+ // A sandbox process will construct an empty
+ // root for itself, so it has to have the CAP_SYS_ADMIN
+ // capability.
+ //
+ // FIXME(b/122554829): The current implementations of
+ // os/exec doesn't allow to set ambient capabilities if
+ // a process is started in a new user namespace. As a
+ // workaround, we start the sandbox process with the 0
+ // UID and then it constructs a chroot and sets UID to
+ // nobody. https://github.com/golang/go/issues/2315
+ const nobody = 65534
+ cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
+ {
+ ContainerID: 0,
+ HostID: nobody - 1,
+ Size: 1,
+ },
+ {
+ ContainerID: nobody,
+ HostID: nobody,
+ Size: 1,
+ },
+ }
+ cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
+ {
+ ContainerID: nobody,
+ HostID: nobody,
+ Size: 1,
+ },
+ }
+
+ // Set credentials to run as user and group nobody.
+ cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: nobody}
}
- // Set credentials to run as user and group nobody.
- cmd.SysProcAttr.Credential = &syscall.Credential{
- Uid: 0,
- Gid: nobody,
- }
- cmd.Args = append(cmd.Args, "--setup-root")
} else {
return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
}
@@ -649,7 +667,7 @@ func (s *Sandbox) Wait(cid string) (syscall.WaitStatus, error) {
// WaitPID waits for process 'pid' in the container's sandbox and returns its
// WaitStatus.
-func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.WaitStatus, error) {
+func (s *Sandbox) WaitPID(cid string, pid int32) (syscall.WaitStatus, error) {
log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
var ws syscall.WaitStatus
conn, err := s.sandboxConnect()
@@ -659,9 +677,8 @@ func (s *Sandbox) WaitPID(cid string, pid int32, clearStatus bool) (syscall.Wait
defer conn.Close()
args := &boot.WaitPIDArgs{
- PID: pid,
- CID: cid,
- ClearStatus: clearStatus,
+ PID: pid,
+ CID: cid,
}
if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil {
return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err)
diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD
index 15476de6f..0456e4c4f 100644
--- a/runsc/specutils/BUILD
+++ b/runsc/specutils/BUILD
@@ -10,10 +10,7 @@ go_library(
"specutils.go",
],
importpath = "gvisor.googlesource.com/gvisor/runsc/specutils",
- visibility = [
- "//runsc:__subpackages__",
- "//test:__subpackages__",
- ],
+ visibility = ["//:sandbox"],
deps = [
"//pkg/abi/linux",
"//pkg/log",
diff --git a/runsc/specutils/fs.go b/runsc/specutils/fs.go
index 1f3afb4e4..6e6902e9f 100644
--- a/runsc/specutils/fs.go
+++ b/runsc/specutils/fs.go
@@ -16,6 +16,7 @@ package specutils
import (
"fmt"
+ "math/bits"
"path"
"syscall"
@@ -105,22 +106,30 @@ func optionsToFlags(opts []string, source map[string]mapping) uint32 {
return rv
}
-// ValidateMount validates that spec mounts are correct.
+// validateMount validates that spec mounts are correct.
func validateMount(mnt *specs.Mount) error {
if !path.IsAbs(mnt.Destination) {
return fmt.Errorf("Mount.Destination must be an absolute path: %v", mnt)
}
-
if mnt.Type == "bind" {
- for _, o := range mnt.Options {
- if ContainsStr(invalidOptions, o) {
- return fmt.Errorf("mount option %q is not supported: %v", o, mnt)
- }
- _, ok1 := optionsMap[o]
- _, ok2 := propOptionsMap[o]
- if !ok1 && !ok2 {
- return fmt.Errorf("unknown mount option %q", o)
- }
+ return ValidateMountOptions(mnt.Options)
+ }
+ return nil
+}
+
+// ValidateMountOptions validates that mount options are correct.
+func ValidateMountOptions(opts []string) error {
+ for _, o := range opts {
+ if ContainsStr(invalidOptions, o) {
+ return fmt.Errorf("mount option %q is not supported", o)
+ }
+ _, ok1 := optionsMap[o]
+ _, ok2 := propOptionsMap[o]
+ if !ok1 && !ok2 {
+ return fmt.Errorf("unknown mount option %q", o)
+ }
+ if err := validatePropagation(o); err != nil {
+ return err
}
}
return nil
@@ -133,5 +142,14 @@ func validateRootfsPropagation(opt string) error {
if flags&(syscall.MS_SLAVE|syscall.MS_PRIVATE) == 0 {
return fmt.Errorf("root mount propagation option must specify private or slave: %q", opt)
}
+ return validatePropagation(opt)
+}
+
+func validatePropagation(opt string) error {
+ flags := PropOptionsToFlags([]string{opt})
+ exclusive := flags & (syscall.MS_SLAVE | syscall.MS_PRIVATE | syscall.MS_SHARED | syscall.MS_UNBINDABLE)
+ if bits.OnesCount32(exclusive) > 1 {
+ return fmt.Errorf("mount propagation options are mutually exclusive: %q", opt)
+ }
return nil
}
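
validatePropagation above relies on each propagation option setting exactly one MS_* bit, so a population count greater than one means conflicting options were combined. A small worked example with the standard syscall constants (a sketch, not the specutils helper itself):

package main

import (
	"fmt"
	"math/bits"
	"syscall"
)

func main() {
	// "shared" and "slave" each contribute one propagation bit; combining them
	// yields a popcount of 2, which validatePropagation rejects as mutually
	// exclusive.
	conflicting := uint32(syscall.MS_SHARED | syscall.MS_SLAVE)
	fmt.Println(bits.OnesCount32(conflicting)) // 2 -> rejected

	ok := uint32(syscall.MS_PRIVATE)
	fmt.Println(bits.OnesCount32(ok)) // 1 -> accepted
}
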
diff --git a/runsc/specutils/namespace.go b/runsc/specutils/namespace.go
index 7d194335c..06c13d1ab 100644
--- a/runsc/specutils/namespace.go
+++ b/runsc/specutils/namespace.go
@@ -220,3 +220,55 @@ func HasCapabilities(cs ...capability.Cap) bool {
}
return true
}
+
+// MaybeRunAsRoot ensures the process runs with capabilities needed to create a
+// sandbox, e.g. CAP_SYS_ADMIN, CAP_SYS_CHROOT, etc. If capabilities are needed,
+// it will create a new user namespace and re-execute the process as root
+// inside the namespace with the same arguments and environment.
+//
+// This function returns immediately when no new capability is needed. If
+// another process is executed, the current process exits from here with the same
+// exit code as the child.
+func MaybeRunAsRoot() error {
+ if HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT, capability.CAP_SETUID, capability.CAP_SETGID) {
+ return nil
+ }
+
+ // Current process doesn't have required capabilities, create user namespace
+ // and run as root inside the namespace to acquire capabilities.
+ log.Infof("*** Re-running as root in new user namespace ***")
+
+ cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
+
+ cmd.SysProcAttr = &syscall.SysProcAttr{
+ Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
+ // Set current user/group as root inside the namespace. Since we may not
+ // have CAP_SETUID/CAP_SETGID, just map root to the current user/group.
+ UidMappings: []syscall.SysProcIDMap{
+ {ContainerID: 0, HostID: os.Getuid(), Size: 1},
+ },
+ GidMappings: []syscall.SysProcIDMap{
+ {ContainerID: 0, HostID: os.Getgid(), Size: 1},
+ },
+ Credential: &syscall.Credential{Uid: 0, Gid: 0},
+ GidMappingsEnableSetgroups: false,
+ }
+
+ cmd.Env = os.Environ()
+ cmd.Stdin = os.Stdin
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ if err := cmd.Run(); err != nil {
+ if exit, ok := err.(*exec.ExitError); ok {
+ if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
+ os.Exit(ws.ExitStatus())
+ }
+ log.Warningf("No wait status provided, exiting with -1: %v", err)
+ os.Exit(-1)
+ }
+ return fmt.Errorf("re-executing self: %v", err)
+ }
+ // Child completed with success.
+ os.Exit(0)
+ panic("unreachable")
+}
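
MaybeRunAsRoot is meant to be called once, early, before any work that needs the listed capabilities. A hypothetical call site (the surrounding main is illustrative, not taken from the change):

package main

import (
	"fmt"
	"os"

	"gvisor.googlesource.com/gvisor/runsc/specutils"
)

func main() {
	// Either returns immediately (capabilities already present), exits with the
	// re-executed child's status, or reports a failure to re-exec.
	if err := specutils.MaybeRunAsRoot(); err != nil {
		fmt.Fprintf(os.Stderr, "error acquiring capabilities: %v\n", err)
		os.Exit(1)
	}
	// Past this point the process holds CAP_SYS_ADMIN, CAP_SYS_CHROOT,
	// CAP_SETUID and CAP_SETGID, either natively or as root inside a new user
	// namespace.
	fmt.Println("running with sandbox-creation capabilities")
}
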
diff --git a/runsc/test/integration/BUILD b/runsc/test/integration/BUILD
index 0c4e4fa80..04ed885c6 100644
--- a/runsc/test/integration/BUILD
+++ b/runsc/test/integration/BUILD
@@ -8,6 +8,7 @@ go_test(
srcs = [
"exec_test.go",
"integration_test.go",
+ "regression_test.go",
],
embed = [":integration"],
tags = [
diff --git a/runsc/test/integration/exec_test.go b/runsc/test/integration/exec_test.go
index 7af064d79..7c0e61ac3 100644
--- a/runsc/test/integration/exec_test.go
+++ b/runsc/test/integration/exec_test.go
@@ -29,6 +29,7 @@ package integration
import (
"fmt"
"strconv"
+ "strings"
"syscall"
"testing"
"time"
@@ -136,3 +137,25 @@ func TestExecJobControl(t *testing.T) {
t.Errorf("ws.ExitedStatus got %d, want %d", got, want)
}
}
+
+// Test that failure to exec returns proper error message.
+func TestExecError(t *testing.T) {
+ if err := testutil.Pull("alpine"); err != nil {
+ t.Fatalf("docker pull failed: %v", err)
+ }
+ d := testutil.MakeDocker("exec-error-test")
+
+ // Start the container.
+ if err := d.Run("alpine", "sleep", "1000"); err != nil {
+ t.Fatalf("docker run failed: %v", err)
+ }
+ defer d.CleanUp()
+
+ _, err := d.Exec("no_can_find")
+ if err == nil {
+ t.Fatalf("docker exec didn't fail")
+ }
+ if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(err.Error(), want) {
+ t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", err.Error(), want)
+ }
+}
diff --git a/runsc/test/integration/regression_test.go b/runsc/test/integration/regression_test.go
new file mode 100644
index 000000000..80bae9970
--- /dev/null
+++ b/runsc/test/integration/regression_test.go
@@ -0,0 +1,45 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package integration
+
+import (
+ "strings"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/runsc/test/testutil"
+)
+
+// Test that UDS can be created using overlay when parent directory is in lower
+// layer only (b/134090485).
+//
+// Prerequisite: the directory where the socket file is created must not have
+// been opened for writing before bind(2) is called.
+func TestBindOverlay(t *testing.T) {
+ if err := testutil.Pull("ubuntu:trusty"); err != nil {
+ t.Fatal("docker pull failed:", err)
+ }
+ d := testutil.MakeDocker("bind-overlay-test")
+
+ cmd := "nc -l -U /var/run/sock& sleep 1 && echo foobar-asdf | nc -U /var/run/sock"
+ got, err := d.RunFg("ubuntu:trusty", "bash", "-c", cmd)
+ if err != nil {
+ t.Fatal("docker run failed:", err)
+ }
+
+ if want := "foobar-asdf"; !strings.Contains(got, want) {
+ t.Fatalf("docker run output is missing %q: %s", want, got)
+ }
+ defer d.CleanUp()
+}
diff --git a/runsc/test/testutil/BUILD b/runsc/test/testutil/BUILD
index ddec81444..eedf962a4 100644
--- a/runsc/test/testutil/BUILD
+++ b/runsc/test/testutil/BUILD
@@ -18,6 +18,5 @@ go_library(
"@com_github_cenkalti_backoff//:go_default_library",
"@com_github_kr_pty//:go_default_library",
"@com_github_opencontainers_runtime-spec//specs-go:go_default_library",
- "@com_github_syndtr_gocapability//capability:go_default_library",
],
)
diff --git a/runsc/test/testutil/testutil.go b/runsc/test/testutil/testutil.go
index 9efb1ba8e..1bd5adc54 100644
--- a/runsc/test/testutil/testutil.go
+++ b/runsc/test/testutil/testutil.go
@@ -30,7 +30,6 @@ import (
"os/exec"
"os/signal"
"path/filepath"
- "runtime"
"strings"
"sync"
"sync/atomic"
@@ -39,7 +38,6 @@ import (
"github.com/cenkalti/backoff"
specs "github.com/opencontainers/runtime-spec/specs-go"
- "github.com/syndtr/gocapability/capability"
"gvisor.googlesource.com/gvisor/runsc/boot"
"gvisor.googlesource.com/gvisor/runsc/specutils"
)
@@ -136,6 +134,7 @@ func TestConfig() *boot.Config {
Strace: true,
FileAccess: boot.FileAccessExclusive,
TestOnlyAllowRunAsCurrentUserWithoutChroot: true,
+ NumNetworkChannels: 1,
}
}
@@ -283,54 +282,6 @@ func WaitForHTTP(port int, timeout time.Duration) error {
return Poll(cb, timeout)
}
-// RunAsRoot ensures the test runs with CAP_SYS_ADMIN and CAP_SYS_CHROOT. If
-// needed it will create a new user namespace and re-execute the test as root
-// inside of the namespace. This function returns when it's running as root. If
-// it needs to create another process, it will exit from there and not return.
-func RunAsRoot() {
- if specutils.HasCapabilities(capability.CAP_SYS_ADMIN, capability.CAP_SYS_CHROOT) {
- return
- }
-
- fmt.Println("*** Re-running test as root in new user namespace ***")
-
- // Current process doesn't have CAP_SYS_ADMIN, create user namespace and run
- // as root inside that namespace to get it.
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
- cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
- cmd.SysProcAttr = &syscall.SysProcAttr{
- Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
- // Set current user/group as root inside the namespace.
- UidMappings: []syscall.SysProcIDMap{
- {ContainerID: 0, HostID: os.Getuid(), Size: 1},
- },
- GidMappings: []syscall.SysProcIDMap{
- {ContainerID: 0, HostID: os.Getgid(), Size: 1},
- },
- GidMappingsEnableSetgroups: false,
- Credential: &syscall.Credential{
- Uid: 0,
- Gid: 0,
- },
- }
- cmd.Env = os.Environ()
- cmd.Stdin = os.Stdin
- cmd.Stdout = os.Stdout
- cmd.Stderr = os.Stderr
- if err := cmd.Run(); err != nil {
- if exit, ok := err.(*exec.ExitError); ok {
- if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
- os.Exit(ws.ExitStatus())
- }
- os.Exit(-1)
- }
- panic(fmt.Sprint("error running child process:", err.Error()))
- }
- os.Exit(0)
-}
-
// Reaper reaps child processes.
type Reaper struct {
// mu protects ch, which will be nil if the reaper is not running.
diff --git a/test/BUILD b/test/BUILD
index e99b4e501..8e1dc5228 100644
--- a/test/BUILD
+++ b/test/BUILD
@@ -1,8 +1,4 @@
-# gVisor is a general-purpose sandbox.
-
-package(licenses = ["notice"])
-
-exports_files(["LICENSE"])
+package(licenses = ["notice"]) # Apache 2.0
# We need to define a bazel platform and toolchain to specify dockerPrivileged
# and dockerRunAsRoot options, they are required to run tests on the RBE
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index c53742d14..731e2aa85 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -13,11 +13,17 @@ syscall_test(
test = "//test/syscalls/linux:accept_bind_test",
)
-syscall_test(test = "//test/syscalls/linux:access_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:access_test",
+)
syscall_test(test = "//test/syscalls/linux:affinity_test")
-syscall_test(test = "//test/syscalls/linux:aio_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:aio_test",
+)
syscall_test(
size = "medium",
@@ -30,6 +36,7 @@ syscall_test(test = "//test/syscalls/linux:bad_test")
syscall_test(
size = "large",
+ add_overlay = True,
test = "//test/syscalls/linux:bind_test",
)
@@ -37,17 +44,27 @@ syscall_test(test = "//test/syscalls/linux:brk_test")
syscall_test(test = "//test/syscalls/linux:socket_test")
-syscall_test(test = "//test/syscalls/linux:chdir_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:chdir_test",
+)
-syscall_test(test = "//test/syscalls/linux:chmod_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:chmod_test",
+)
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:chown_test",
use_tmpfs = True, # chown tests require gofer to be running as root.
)
-syscall_test(test = "//test/syscalls/linux:chroot_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:chroot_test",
+)
syscall_test(test = "//test/syscalls/linux:clock_getres_test")
@@ -60,11 +77,17 @@ syscall_test(test = "//test/syscalls/linux:clock_nanosleep_test")
syscall_test(test = "//test/syscalls/linux:concurrency_test")
-syscall_test(test = "//test/syscalls/linux:creat_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:creat_test",
+)
syscall_test(test = "//test/syscalls/linux:dev_test")
-syscall_test(test = "//test/syscalls/linux:dup_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:dup_test",
+)
syscall_test(test = "//test/syscalls/linux:epoll_test")
@@ -74,23 +97,34 @@ syscall_test(test = "//test/syscalls/linux:exceptions_test")
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:exec_test",
)
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:exec_binary_test",
)
syscall_test(test = "//test/syscalls/linux:exit_test")
-syscall_test(test = "//test/syscalls/linux:fadvise64_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:fadvise64_test",
+)
-syscall_test(test = "//test/syscalls/linux:fallocate_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:fallocate_test",
+)
syscall_test(test = "//test/syscalls/linux:fault_test")
-syscall_test(test = "//test/syscalls/linux:fchdir_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:fchdir_test",
+)
syscall_test(
size = "medium",
@@ -99,6 +133,7 @@ syscall_test(
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:flock_test",
)
@@ -108,7 +143,10 @@ syscall_test(test = "//test/syscalls/linux:fpsig_fork_test")
syscall_test(test = "//test/syscalls/linux:fpsig_nested_test")
-syscall_test(test = "//test/syscalls/linux:fsync_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:fsync_test",
+)
syscall_test(
size = "medium",
@@ -120,7 +158,10 @@ syscall_test(test = "//test/syscalls/linux:getcpu_host_test")
syscall_test(test = "//test/syscalls/linux:getcpu_test")
-syscall_test(test = "//test/syscalls/linux:getdents_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:getdents_test",
+)
syscall_test(test = "//test/syscalls/linux:getrandom_test")
@@ -128,11 +169,13 @@ syscall_test(test = "//test/syscalls/linux:getrusage_test")
syscall_test(
size = "medium",
+ add_overlay = False, # TODO(gvisor.dev/issue/317): enable when fixed.
test = "//test/syscalls/linux:inotify_test",
)
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:ioctl_test",
)
@@ -144,11 +187,15 @@ syscall_test(
syscall_test(test = "//test/syscalls/linux:kill_test")
syscall_test(
+ add_overlay = True,
test = "//test/syscalls/linux:link_test",
use_tmpfs = True, # gofer needs CAP_DAC_READ_SEARCH to use AT_EMPTY_PATH with linkat(2)
)
-syscall_test(test = "//test/syscalls/linux:lseek_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:lseek_test",
+)
syscall_test(test = "//test/syscalls/linux:madvise_test")
@@ -158,9 +205,13 @@ syscall_test(test = "//test/syscalls/linux:mempolicy_test")
syscall_test(test = "//test/syscalls/linux:mincore_test")
-syscall_test(test = "//test/syscalls/linux:mkdir_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:mkdir_test",
+)
syscall_test(
+ add_overlay = True,
test = "//test/syscalls/linux:mknod_test",
use_tmpfs = True, # mknod is not supported over gofer.
)
@@ -171,7 +222,10 @@ syscall_test(
test = "//test/syscalls/linux:mmap_test",
)
-syscall_test(test = "//test/syscalls/linux:mount_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:mount_test",
+)
syscall_test(
size = "medium",
@@ -185,9 +239,15 @@ syscall_test(
syscall_test(test = "//test/syscalls/linux:munmap_test")
-syscall_test(test = "//test/syscalls/linux:open_create_test")
+syscall_test(
+ add_overlay = False, # TODO(gvisor.dev/issue/316): enable when fixed.
+ test = "//test/syscalls/linux:open_create_test",
+)
-syscall_test(test = "//test/syscalls/linux:open_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:open_test",
+)
syscall_test(test = "//test/syscalls/linux:partial_bad_buffer_test")
@@ -195,6 +255,7 @@ syscall_test(test = "//test/syscalls/linux:pause_test")
syscall_test(
size = "large",
+ add_overlay = False, # TODO(gvisor.dev/issue/318): enable when fixed.
shard_count = 5,
test = "//test/syscalls/linux:pipe_test",
)
@@ -210,11 +271,20 @@ syscall_test(test = "//test/syscalls/linux:prctl_setuid_test")
syscall_test(test = "//test/syscalls/linux:prctl_test")
-syscall_test(test = "//test/syscalls/linux:pread64_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:pread64_test",
+)
-syscall_test(test = "//test/syscalls/linux:preadv_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:preadv_test",
+)
-syscall_test(test = "//test/syscalls/linux:preadv2_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:preadv2_test",
+)
syscall_test(test = "//test/syscalls/linux:priority_test")
@@ -239,13 +309,22 @@ syscall_test(
test = "//test/syscalls/linux:pty_test",
)
-syscall_test(test = "//test/syscalls/linux:pwritev2_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:pwritev2_test",
+)
-syscall_test(test = "//test/syscalls/linux:pwrite64_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:pwrite64_test",
+)
syscall_test(test = "//test/syscalls/linux:raw_socket_ipv4_test")
-syscall_test(test = "//test/syscalls/linux:read_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:read_test",
+)
syscall_test(
size = "medium",
@@ -254,11 +333,13 @@ syscall_test(
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:readv_test",
)
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:rename_test",
)
@@ -279,11 +360,20 @@ syscall_test(
test = "//test/syscalls/linux:semaphore_test",
)
-syscall_test(test = "//test/syscalls/linux:sendfile_socket_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:sendfile_socket_test",
+)
-syscall_test(test = "//test/syscalls/linux:sendfile_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:sendfile_test",
+)
-syscall_test(test = "//test/syscalls/linux:splice_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:splice_test",
+)
syscall_test(test = "//test/syscalls/linux:sigaction_test")
@@ -330,11 +420,13 @@ syscall_test(
syscall_test(
size = "medium",
+ add_overlay = True,
test = "//test/syscalls/linux:socket_filesystem_non_blocking_test",
)
syscall_test(
size = "large",
+ add_overlay = True,
shard_count = 10,
test = "//test/syscalls/linux:socket_filesystem_test",
)
@@ -418,12 +510,6 @@ syscall_test(
)
syscall_test(
- size = "large",
- shard_count = 10,
- test = "//test/syscalls/linux:socket_unix_abstract_test",
-)
-
-syscall_test(
# NOTE(b/116636318): Large sendmsg may stall a long time.
size = "enormous",
test = "//test/syscalls/linux:socket_unix_dgram_local_test",
@@ -436,12 +522,7 @@ syscall_test(
syscall_test(
size = "large",
- shard_count = 10,
- test = "//test/syscalls/linux:socket_unix_filesystem_test",
-)
-
-syscall_test(
- size = "large",
+ add_overlay = True,
shard_count = 10,
test = "//test/syscalls/linux:socket_unix_pair_test",
)
@@ -484,19 +565,40 @@ syscall_test(
test = "//test/syscalls/linux:socket_unix_unbound_stream_test",
)
-syscall_test(test = "//test/syscalls/linux:statfs_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:statfs_test",
+)
-syscall_test(test = "//test/syscalls/linux:stat_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:stat_test",
+)
-syscall_test(test = "//test/syscalls/linux:stat_times_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:stat_times_test",
+)
-syscall_test(test = "//test/syscalls/linux:sticky_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:sticky_test",
+)
-syscall_test(test = "//test/syscalls/linux:symlink_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:symlink_test",
+)
-syscall_test(test = "//test/syscalls/linux:sync_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:sync_test",
+)
-syscall_test(test = "//test/syscalls/linux:sync_file_range_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:sync_file_range_test",
+)
syscall_test(test = "//test/syscalls/linux:sysinfo_test")
@@ -520,7 +622,10 @@ syscall_test(test = "//test/syscalls/linux:time_test")
syscall_test(test = "//test/syscalls/linux:tkill_test")
-syscall_test(test = "//test/syscalls/linux:truncate_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:truncate_test",
+)
syscall_test(test = "//test/syscalls/linux:udp_bind_test")
@@ -534,7 +639,10 @@ syscall_test(test = "//test/syscalls/linux:uidgid_test")
syscall_test(test = "//test/syscalls/linux:uname_test")
-syscall_test(test = "//test/syscalls/linux:unlink_test")
+syscall_test(
+ add_overlay = True,
+ test = "//test/syscalls/linux:unlink_test",
+)
syscall_test(test = "//test/syscalls/linux:unshare_test")
@@ -556,15 +664,13 @@ syscall_test(
test = "//test/syscalls/linux:wait_test",
)
-syscall_test(test = "//test/syscalls/linux:write_test")
-
syscall_test(
- test = "//test/syscalls/linux:proc_net_unix_test",
- # Unix domain socket creation isn't supported on all file systems. The
- # sentry-internal tmpfs is known to support it.
- use_tmpfs = True,
+ add_overlay = True,
+ test = "//test/syscalls/linux:write_test",
)
+syscall_test(test = "//test/syscalls/linux:proc_net_unix_test")
+
go_binary(
name = "syscall_test_runner",
srcs = ["syscall_test_runner.go"],
diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl
index cd74a769d..9f2fc9109 100644
--- a/test/syscalls/build_defs.bzl
+++ b/test/syscalls/build_defs.bzl
@@ -7,6 +7,7 @@ def syscall_test(
shard_count = 1,
size = "small",
use_tmpfs = False,
+ add_overlay = False,
tags = None,
parallel = True):
_syscall_test(
@@ -39,6 +40,18 @@ def syscall_test(
parallel = parallel,
)
+ if add_overlay:
+ _syscall_test(
+ test = test,
+ shard_count = shard_count,
+ size = size,
+ platform = "ptrace",
+ use_tmpfs = False, # overlay adds a writable tmpfs on top of root.
+ tags = tags,
+ parallel = parallel,
+ overlay = True,
+ )
+
if not use_tmpfs:
# Also test shared gofer access.
_syscall_test(
@@ -60,7 +73,8 @@ def _syscall_test(
use_tmpfs,
tags,
parallel,
- file_access = "exclusive"):
+ file_access = "exclusive",
+ overlay = False):
test_name = test.split(":")[1]
# Prepend "runsc" to non-native platform names.
@@ -69,6 +83,8 @@ def _syscall_test(
name = test_name + "_" + full_platform
if file_access == "shared":
name += "_shared"
+ if overlay:
+ name += "_overlay"
if tags == None:
tags = []
@@ -92,6 +108,7 @@ def _syscall_test(
"--platform=" + platform,
"--use-tmpfs=" + str(use_tmpfs),
"--file-access=" + file_access,
+ "--overlay=" + str(overlay),
]
if parallel:
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index 8465e5ad0..9bafc6e4f 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -999,6 +999,7 @@ cc_binary(
linkstatic = 1,
deps = [
"//test/util:cleanup",
+ "//test/util:memory_util",
"//test/util:test_main",
"//test/util:test_util",
"//test/util:thread_util",
@@ -1317,6 +1318,7 @@ cc_binary(
linkstatic = 1,
deps = [
"//test/util:capability_util",
+ "//test/util:cleanup",
"//test/util:multiprocess_util",
"//test/util:posix_error",
"//test/util:test_util",
@@ -2095,6 +2097,7 @@ cc_binary(
deps = [
":socket_generic_test_cases",
":socket_test_util",
+ ":socket_unix_cmsg_test_cases",
":socket_unix_test_cases",
":unix_domain_socket_test_util",
"//test/util:test_main",
@@ -2368,6 +2371,7 @@ cc_binary(
deps = [
":socket_generic_test_cases",
":socket_test_util",
+ ":socket_unix_cmsg_test_cases",
":socket_unix_test_cases",
":unix_domain_socket_test_util",
"//test/util:test_main",
@@ -2490,6 +2494,26 @@ cc_library(
)
cc_library(
+ name = "socket_unix_cmsg_test_cases",
+ testonly = 1,
+ srcs = [
+ "socket_unix_cmsg.cc",
+ ],
+ hdrs = [
+ "socket_unix_cmsg.h",
+ ],
+ deps = [
+ ":socket_test_util",
+ ":unix_domain_socket_test_util",
+ "//test/util:test_util",
+ "//test/util:thread_util",
+ "@com_google_absl//absl/strings",
+ "@com_google_googletest//:gtest",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
name = "socket_stream_blocking_test_cases",
testonly = 1,
srcs = [
@@ -2614,22 +2638,6 @@ cc_binary(
)
cc_binary(
- name = "socket_unix_abstract_test",
- testonly = 1,
- srcs = [
- "socket_unix_abstract.cc",
- ],
- linkstatic = 1,
- deps = [
- ":socket_test_util",
- ":socket_unix_test_cases",
- ":unix_domain_socket_test_util",
- "//test/util:test_main",
- "//test/util:test_util",
- ],
-)
-
-cc_binary(
name = "socket_unix_unbound_dgram_test",
testonly = 1,
srcs = ["socket_unix_unbound_dgram.cc"],
@@ -2672,23 +2680,6 @@ cc_binary(
)
cc_binary(
- name = "socket_unix_filesystem_test",
- testonly = 1,
- srcs = [
- "socket_unix_filesystem.cc",
- ],
- linkstatic = 1,
- deps = [
- ":socket_test_util",
- ":socket_unix_test_cases",
- ":unix_domain_socket_test_util",
- "//test/util:test_main",
- "//test/util:test_util",
- "@com_google_googletest//:gtest",
- ],
-)
-
-cc_binary(
name = "socket_blocking_local_test",
testonly = 1,
srcs = [
@@ -2765,6 +2756,7 @@ cc_binary(
linkstatic = 1,
deps = [
":socket_test_util",
+ ":socket_unix_cmsg_test_cases",
":socket_unix_test_cases",
":unix_domain_socket_test_util",
"//test/util:test_main",
diff --git a/test/syscalls/linux/accept_bind.cc b/test/syscalls/linux/accept_bind.cc
index 56377feab..1122ea240 100644
--- a/test/syscalls/linux/accept_bind.cc
+++ b/test/syscalls/linux/accept_bind.cc
@@ -448,19 +448,7 @@ TEST_P(AllSocketPairTest, UnboundSenderAddr) {
RetryEINTR(recvfrom)(accepted_fd.get(), &i, sizeof(i), 0,
reinterpret_cast<sockaddr*>(&addr), &addr_len),
SyscallSucceedsWithValue(sizeof(i)));
- if (!IsRunningOnGvisor()) {
- // Linux returns a zero length for addresses from recvfrom(2) and
- // recvmsg(2). This differs from the behavior of getpeername(2) and
- // getsockname(2). For simplicity, we use the getpeername(2) and
- // getsockname(2) behavior for recvfrom(2) and recvmsg(2).
- EXPECT_EQ(addr_len, 0);
- return;
- }
- EXPECT_EQ(addr_len, 2);
- EXPECT_EQ(
- memcmp(&addr, sockets->second_addr(),
- std::min((size_t)addr_len, (size_t)sockets->second_addr_len())),
- 0);
+ EXPECT_EQ(addr_len, 0);
}
TEST_P(AllSocketPairTest, BoundSenderAddr) {
diff --git a/test/syscalls/linux/mempolicy.cc b/test/syscalls/linux/mempolicy.cc
index 4ac4cb88f..9d5f47651 100644
--- a/test/syscalls/linux/mempolicy.cc
+++ b/test/syscalls/linux/mempolicy.cc
@@ -18,6 +18,7 @@
#include "gtest/gtest.h"
#include "absl/memory/memory.h"
#include "test/util/cleanup.h"
+#include "test/util/memory_util.h"
#include "test/util/test_util.h"
#include "test/util/thread_util.h"
@@ -34,7 +35,7 @@ namespace {
#define MPOL_PREFERRED 1
#define MPOL_BIND 2
#define MPOL_INTERLEAVE 3
-#define MPOL_MAX MPOL_INTERLEAVE
+#define MPOL_LOCAL 4
#define MPOL_F_NODE (1 << 0)
#define MPOL_F_ADDR (1 << 1)
#define MPOL_F_MEMS_ALLOWED (1 << 2)
@@ -44,11 +45,17 @@ namespace {
int get_mempolicy(int *policy, uint64_t *nmask, uint64_t maxnode, void *addr,
int flags) {
- return syscall(__NR_get_mempolicy, policy, nmask, maxnode, addr, flags);
+ return syscall(SYS_get_mempolicy, policy, nmask, maxnode, addr, flags);
}
int set_mempolicy(int mode, uint64_t *nmask, uint64_t maxnode) {
- return syscall(__NR_set_mempolicy, mode, nmask, maxnode);
+ return syscall(SYS_set_mempolicy, mode, nmask, maxnode);
+}
+
+int mbind(void *addr, unsigned long len, int mode,
+ const unsigned long *nodemask, unsigned long maxnode,
+ unsigned flags) {
+ return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
}
// Creates a cleanup object that resets the calling thread's mempolicy to the
@@ -252,6 +259,30 @@ TEST(MempolicyTest, GetMempolicyNextInterleaveNode) {
EXPECT_EQ(0, mode);
}
+TEST(MempolicyTest, Mbind) {
+ // Temporarily set the thread policy to MPOL_PREFERRED.
+ const auto cleanup_thread_policy =
+ ASSERT_NO_ERRNO_AND_VALUE(ScopedSetMempolicy(MPOL_PREFERRED, nullptr, 0));
+
+ const auto mapping = ASSERT_NO_ERRNO_AND_VALUE(
+ MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS));
+
+ // vmas default to MPOL_DEFAULT irrespective of the thread policy (currently
+ // MPOL_PREFERRED).
+ int mode;
+ ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, mapping.ptr(), MPOL_F_ADDR),
+ SyscallSucceeds());
+ EXPECT_EQ(mode, MPOL_DEFAULT);
+
+ // Set MPOL_PREFERRED for the vma and read it back.
+ ASSERT_THAT(
+ mbind(mapping.ptr(), mapping.len(), MPOL_PREFERRED, nullptr, 0, 0),
+ SyscallSucceeds());
+ ASSERT_THAT(get_mempolicy(&mode, nullptr, 0, mapping.ptr(), MPOL_F_ADDR),
+ SyscallSucceeds());
+ EXPECT_EQ(mode, MPOL_PREFERRED);
+}
+
} // namespace
} // namespace testing
diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc
index bce351e08..67b93ecf5 100644
--- a/test/syscalls/linux/pipe.cc
+++ b/test/syscalls/linux/pipe.cc
@@ -55,7 +55,7 @@ class PipeTest : public ::testing::TestWithParam<PipeCreator> {
FileDescriptor wfd;
public:
- static void SetUpTestCase() {
+ static void SetUpTestSuite() {
// Tests intentionally generate SIGPIPE.
TEST_PCHECK(signal(SIGPIPE, SIG_IGN) != SIG_ERR);
}
@@ -82,7 +82,7 @@ class PipeTest : public ::testing::TestWithParam<PipeCreator> {
return s1;
}
- static void TearDownTestCase() {
+ static void TearDownTestSuite() {
TEST_PCHECK(signal(SIGPIPE, SIG_DFL) != SIG_ERR);
}
diff --git a/test/syscalls/linux/prctl.cc b/test/syscalls/linux/prctl.cc
index bce42dc74..bd1779557 100644
--- a/test/syscalls/linux/prctl.cc
+++ b/test/syscalls/linux/prctl.cc
@@ -17,10 +17,12 @@
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
+
#include <string>
#include "gtest/gtest.h"
#include "test/util/capability_util.h"
+#include "test/util/cleanup.h"
#include "test/util/multiprocess_util.h"
#include "test/util/posix_error.h"
#include "test/util/test_util.h"
@@ -35,6 +37,16 @@ namespace testing {
namespace {
+#ifndef SUID_DUMP_DISABLE
+#define SUID_DUMP_DISABLE 0
+#endif /* SUID_DUMP_DISABLE */
+#ifndef SUID_DUMP_USER
+#define SUID_DUMP_USER 1
+#endif /* SUID_DUMP_USER */
+#ifndef SUID_DUMP_ROOT
+#define SUID_DUMP_ROOT 2
+#endif /* SUID_DUMP_ROOT */
+
TEST(PrctlTest, NameInitialized) {
const size_t name_length = 20;
char name[name_length] = {};
@@ -178,6 +190,28 @@ TEST(PrctlTest, InvalidPrSetMM) {
ASSERT_THAT(prctl(PR_SET_MM, 0, 0, 0, 0), SyscallFailsWithErrno(EPERM));
}
+// Sanity check that dumpability is remembered.
+TEST(PrctlTest, SetGetDumpability) {
+ int before;
+ ASSERT_THAT(before = prctl(PR_GET_DUMPABLE), SyscallSucceeds());
+ auto cleanup = Cleanup([before] {
+ ASSERT_THAT(prctl(PR_SET_DUMPABLE, before), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_DISABLE), SyscallSucceeds());
+ EXPECT_THAT(prctl(PR_GET_DUMPABLE),
+ SyscallSucceedsWithValue(SUID_DUMP_DISABLE));
+
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_USER), SyscallSucceeds());
+ EXPECT_THAT(prctl(PR_GET_DUMPABLE), SyscallSucceedsWithValue(SUID_DUMP_USER));
+}
+
+// SUID_DUMP_ROOT cannot be set via PR_SET_DUMPABLE.
+TEST(PrctlTest, RootDumpability) {
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_ROOT),
+ SyscallFailsWithErrno(EINVAL));
+}
+
} // namespace
} // namespace testing
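
The dumpability tests above round-trip PR_SET_DUMPABLE/PR_GET_DUMPABLE; the same check can be expressed directly against the raw prctl(2) syscall. A minimal sketch (constants defined locally, as the test does, and assuming a Linux host):

package main

import (
	"fmt"
	"syscall"
)

// From <linux/prctl.h>; defined locally in case they are not exported.
const (
	prGetDumpable = 3 // PR_GET_DUMPABLE
	prSetDumpable = 4 // PR_SET_DUMPABLE
	suidDumpUser  = 1 // SUID_DUMP_USER
)

func main() {
	if _, _, errno := syscall.Syscall(syscall.SYS_PRCTL, prSetDumpable, suidDumpUser, 0); errno != 0 {
		fmt.Println("PR_SET_DUMPABLE failed:", errno)
		return
	}
	got, _, errno := syscall.Syscall(syscall.SYS_PRCTL, prGetDumpable, 0, 0)
	if errno != 0 {
		fmt.Println("PR_GET_DUMPABLE failed:", errno)
		return
	}
	fmt.Println("dumpable =", got) // expect 1, i.e. SUID_DUMP_USER is remembered
}
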
diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc
index ede6fb860..924b98e3a 100644
--- a/test/syscalls/linux/proc.cc
+++ b/test/syscalls/linux/proc.cc
@@ -69,9 +69,11 @@
// way to get it tested on both gVisor, PTrace and Linux.
using ::testing::AllOf;
+using ::testing::AnyOf;
using ::testing::ContainerEq;
using ::testing::Contains;
using ::testing::ContainsRegex;
+using ::testing::Eq;
using ::testing::Gt;
using ::testing::HasSubstr;
using ::testing::IsSupersetOf;
@@ -86,6 +88,16 @@ namespace gvisor {
namespace testing {
namespace {
+#ifndef SUID_DUMP_DISABLE
+#define SUID_DUMP_DISABLE 0
+#endif /* SUID_DUMP_DISABLE */
+#ifndef SUID_DUMP_USER
+#define SUID_DUMP_USER 1
+#endif /* SUID_DUMP_USER */
+#ifndef SUID_DUMP_ROOT
+#define SUID_DUMP_ROOT 2
+#endif /* SUID_DUMP_ROOT */
+
// O_LARGEFILE as defined by Linux. glibc tries to be clever by setting it to 0
// because "it isn't needed", even though Linux can return it via F_GETFL.
constexpr int kOLargeFile = 00100000;
@@ -1896,6 +1908,51 @@ void CheckDuplicatesRecursively(std::string path) {
TEST(Proc, NoDuplicates) { CheckDuplicatesRecursively("/proc"); }
+// Most /proc/PID files are owned by the task user with SUID_DUMP_USER.
+TEST(ProcPid, UserDumpableOwner) {
+ int before;
+ ASSERT_THAT(before = prctl(PR_GET_DUMPABLE), SyscallSucceeds());
+ auto cleanup = Cleanup([before] {
+ ASSERT_THAT(prctl(PR_SET_DUMPABLE, before), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_USER), SyscallSucceeds());
+
+ // This applies to the task directory itself and files inside.
+ struct stat st;
+ ASSERT_THAT(stat("/proc/self/", &st), SyscallSucceeds());
+ EXPECT_EQ(st.st_uid, geteuid());
+ EXPECT_EQ(st.st_gid, getegid());
+
+ ASSERT_THAT(stat("/proc/self/stat", &st), SyscallSucceeds());
+ EXPECT_EQ(st.st_uid, geteuid());
+ EXPECT_EQ(st.st_gid, getegid());
+}
+
+// /proc/PID files are owned by root with SUID_DUMP_DISABLE.
+TEST(ProcPid, RootDumpableOwner) {
+ int before;
+ ASSERT_THAT(before = prctl(PR_GET_DUMPABLE), SyscallSucceeds());
+ auto cleanup = Cleanup([before] {
+ ASSERT_THAT(prctl(PR_SET_DUMPABLE, before), SyscallSucceeds());
+ });
+
+ EXPECT_THAT(prctl(PR_SET_DUMPABLE, SUID_DUMP_DISABLE), SyscallSucceeds());
+
+ // This *does not* apply to the task directory itself (or other 0555
+ // directories), but it does apply to files inside.
+ struct stat st;
+ ASSERT_THAT(stat("/proc/self/", &st), SyscallSucceeds());
+ EXPECT_EQ(st.st_uid, geteuid());
+ EXPECT_EQ(st.st_gid, getegid());
+
+ // This file is owned by root. Also allow nobody in case this test is running
+ // in a userns without root mapped.
+ ASSERT_THAT(stat("/proc/self/stat", &st), SyscallSucceeds());
+ EXPECT_THAT(st.st_uid, AnyOf(Eq(0), Eq(65534)));
+ EXPECT_THAT(st.st_gid, AnyOf(Eq(0), Eq(65534)));
+}
+
} // namespace
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/proc_net_unix.cc b/test/syscalls/linux/proc_net_unix.cc
index 6d745f728..82d325c17 100644
--- a/test/syscalls/linux/proc_net_unix.cc
+++ b/test/syscalls/linux/proc_net_unix.cc
@@ -34,6 +34,16 @@ using absl::StrFormat;
constexpr char kProcNetUnixHeader[] =
"Num RefCount Protocol Flags Type St Inode Path";
+// Possible values of the "st" field in a /proc/net/unix entry. Source: Linux
+// kernel, include/uapi/linux/net.h.
+enum {
+ SS_FREE = 0, // Not allocated
+ SS_UNCONNECTED, // Unconnected to any socket
+ SS_CONNECTING, // In process of connecting
+ SS_CONNECTED, // Connected to socket
+ SS_DISCONNECTING // In process of disconnecting
+};
+
// UnixEntry represents a single entry from /proc/net/unix.
struct UnixEntry {
uintptr_t addr;
@@ -71,7 +81,12 @@ PosixErrorOr<std::vector<UnixEntry>> ProcNetUnixEntries() {
bool skipped_header = false;
std::vector<UnixEntry> entries;
std::vector<std::string> lines = absl::StrSplit(content, absl::ByAnyChar("\n"));
+ std::cerr << "<contents of /proc/net/unix>" << std::endl;
for (std::string line : lines) {
+ // Emit the proc entry to the test output to provide context for the test
+ // results.
+ std::cerr << line << std::endl;
+
if (!skipped_header) {
EXPECT_EQ(line, kProcNetUnixHeader);
skipped_header = true;
@@ -139,6 +154,7 @@ PosixErrorOr<std::vector<UnixEntry>> ProcNetUnixEntries() {
entries.push_back(entry);
}
+ std::cerr << "<end of /proc/net/unix>" << std::endl;
return entries;
}
@@ -241,6 +257,168 @@ TEST(ProcNetUnix, SocketPair) {
EXPECT_EQ(entries.size(), 2);
}
+TEST(ProcNetUnix, StreamSocketStateUnconnectedOnBind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_STREAM).Create());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ const std::string address = ExtractPath(sockets->first_addr());
+ UnixEntry bind_entry;
+ ASSERT_TRUE(FindByPath(entries, &bind_entry, address));
+ EXPECT_EQ(bind_entry.state, SS_UNCONNECTED);
+}
+
+TEST(ProcNetUnix, StreamSocketStateUnconnectedOnListen) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_STREAM).Create());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ const std::string address = ExtractPath(sockets->first_addr());
+ UnixEntry bind_entry;
+ ASSERT_TRUE(FindByPath(entries, &bind_entry, address));
+ EXPECT_EQ(bind_entry.state, SS_UNCONNECTED);
+
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ UnixEntry listen_entry;
+ ASSERT_TRUE(
+ FindByPath(entries, &listen_entry, ExtractPath(sockets->first_addr())));
+ EXPECT_EQ(listen_entry.state, SS_UNCONNECTED);
+ // The bind and listen entries should refer to the same socket.
+ EXPECT_EQ(listen_entry.inode, bind_entry.inode);
+}
+
+TEST(ProcNetUnix, StreamSocketStateConnectedOnAccept) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_STREAM).Create());
+ const std::string address = ExtractPath(sockets->first_addr());
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(sockets->first_fd(), 5), SyscallSucceeds());
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ UnixEntry listen_entry;
+ ASSERT_TRUE(
+ FindByPath(entries, &listen_entry, ExtractPath(sockets->first_addr())));
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ int clientfd;
+ ASSERT_THAT(clientfd = accept(sockets->first_fd(), nullptr, nullptr),
+ SyscallSucceeds());
+
+ // Find the entry for the accepted socket. UDS proc entries don't have a
+ // remote address, so we distinguish the accepted socket from the listen
+ // socket by checking for a different inode.
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ UnixEntry accept_entry;
+ ASSERT_TRUE(FindBy(
+ entries, &accept_entry, [address, listen_entry](const UnixEntry& e) {
+ return e.path == address && e.inode != listen_entry.inode;
+ }));
+ EXPECT_EQ(accept_entry.state, SS_CONNECTED);
+ // Listen entry should still be in SS_UNCONNECTED state.
+ ASSERT_TRUE(FindBy(entries, &listen_entry,
+ [&sockets, listen_entry](const UnixEntry& e) {
+ return e.path == ExtractPath(sockets->first_addr()) &&
+ e.inode == listen_entry.inode;
+ }));
+ EXPECT_EQ(listen_entry.state, SS_UNCONNECTED);
+}
+
+TEST(ProcNetUnix, DgramSocketStateDisconnectingOnBind) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_DGRAM).Create());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ // On gVisor, the only two UDS sockets on the system are the ones we just
+ // created, and we rely on this to locate the test socket entries in the
+ // remainder of the test. On a generic Linux system, we have no easy way to
+ // locate the corresponding entries, as they don't have an address yet.
+ if (IsRunningOnGvisor()) {
+ ASSERT_EQ(entries.size(), 2);
+ for (auto e : entries) {
+ ASSERT_EQ(e.state, SS_DISCONNECTING);
+ }
+ }
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ const std::string address = ExtractPath(sockets->first_addr());
+ UnixEntry bind_entry;
+ ASSERT_TRUE(FindByPath(entries, &bind_entry, address));
+ EXPECT_EQ(bind_entry.state, SS_UNCONNECTED);
+}
+
+TEST(ProcNetUnix, DgramSocketStateConnectingOnConnect) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(
+ AbstractUnboundUnixDomainSocketPair(SOCK_DGRAM).Create());
+
+ std::vector<UnixEntry> entries =
+ ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ // On gVisor, the only two UDS sockets on the system are the ones we just
+ // created, and we rely on this to locate the test socket entries in the
+ // remainder of the test. On a generic Linux system, we have no easy way to
+ // locate the corresponding entries, as they don't have an address yet.
+ if (IsRunningOnGvisor()) {
+ ASSERT_EQ(entries.size(), 2);
+ for (auto e : entries) {
+ ASSERT_EQ(e.state, SS_DISCONNECTING);
+ }
+ }
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+ const std::string address = ExtractPath(sockets->first_addr());
+ UnixEntry bind_entry;
+ ASSERT_TRUE(FindByPath(entries, &bind_entry, address));
+
+ ASSERT_THAT(connect(sockets->second_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ entries = ASSERT_NO_ERRNO_AND_VALUE(ProcNetUnixEntries());
+
+ // Once again, we have no easy way to identify the connecting socket as it has
+ // no listed address. We can only identify the entry as the "non-bind socket
+ // entry" on gVisor, where we're guaranteed to have only the two entries we
+ // create during this test.
+ if (IsRunningOnGvisor()) {
+ ASSERT_EQ(entries.size(), 2);
+ UnixEntry connect_entry;
+ ASSERT_TRUE(
+ FindBy(entries, &connect_entry, [bind_entry](const UnixEntry& e) {
+ return e.inode != bind_entry.inode;
+ }));
+ EXPECT_EQ(connect_entry.state, SS_CONNECTING);
+ }
+}
+
} // namespace
} // namespace testing
} // namespace gvisor
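
As an aside, the following standalone sketch shows how the "St" column of a /proc/net/unix entry maps onto the SS_* values introduced above. The sample line, the whitespace split, and the hex parse are illustrative assumptions; they are not the parsing logic used by ProcNetUnixEntries().

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  // Hypothetical entry; real reference counts, inodes and paths will differ.
  // Columns: Num RefCount Protocol Flags Type St Inode Path
  const std::string line =
      "0000000000000000: 00000002 00000000 00010000 0001 01 12345 @test-socket";

  // Split on whitespace and pull out the sixth column ("St", printed in hex).
  std::istringstream in(line);
  std::vector<std::string> fields;
  for (std::string f; in >> f;) fields.push_back(f);
  const int state = std::stoi(fields[5], nullptr, 16);

  // Interpret it with the SS_* values from include/uapi/linux/net.h.
  switch (state) {
    case 1: std::cout << "SS_UNCONNECTED (bound or listening)\n"; break;
    case 2: std::cout << "SS_CONNECTING\n"; break;
    case 3: std::cout << "SS_CONNECTED (e.g. an accepted stream socket)\n"; break;
    default: std::cout << "state " << state << "\n"; break;
  }
  return 0;
}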
diff --git a/test/syscalls/linux/sendfile_socket.cc b/test/syscalls/linux/sendfile_socket.cc
index 66adda515..1c56540bc 100644
--- a/test/syscalls/linux/sendfile_socket.cc
+++ b/test/syscalls/linux/sendfile_socket.cc
@@ -33,9 +33,69 @@ namespace gvisor {
namespace testing {
namespace {
+class SendFileTest : public ::testing::TestWithParam<int> {
+ protected:
+ PosixErrorOr<std::tuple<int, int>> Sockets() {
+ // Bind a server socket.
+ int family = GetParam();
+ struct sockaddr server_addr = {};
+ switch (family) {
+ case AF_INET: {
+ struct sockaddr_in *server_addr_in =
+ reinterpret_cast<struct sockaddr_in *>(&server_addr);
+ server_addr_in->sin_family = family;
+ server_addr_in->sin_addr.s_addr = INADDR_ANY;
+ break;
+ }
+ case AF_UNIX: {
+ struct sockaddr_un *server_addr_un =
+ reinterpret_cast<struct sockaddr_un *>(&server_addr);
+ server_addr_un->sun_family = family;
+ server_addr_un->sun_path[0] = '\0';
+ break;
+ }
+ default:
+ return PosixError(EINVAL);
+ }
+ int server = socket(family, SOCK_STREAM, 0);
+ if (bind(server, &server_addr, sizeof(server_addr)) < 0) {
+ return PosixError(errno);
+ }
+ if (listen(server, 1) < 0) {
+ close(server);
+ return PosixError(errno);
+ }
+
+ // Fetch the address; both are anonymous.
+ socklen_t length = sizeof(server_addr);
+ if (getsockname(server, &server_addr, &length) < 0) {
+ close(server);
+ return PosixError(errno);
+ }
+
+ // Connect the client.
+ int client = socket(family, SOCK_STREAM, 0);
+ if (connect(client, &server_addr, length) < 0) {
+ close(server);
+ close(client);
+ return PosixError(errno);
+ }
+
+ // Accept on the server.
+ int server_client = accept(server, nullptr, 0);
+ if (server_client < 0) {
+ close(server);
+ close(client);
+ return PosixError(errno);
+ }
+ close(server);
+ return std::make_tuple(client, server_client);
+ }
+};
+
// Sends large file to exercise the path that read and writes data multiple
// times, esp. when more data is read than can be written.
-TEST(SendFileTest, SendMultiple) {
+TEST_P(SendFileTest, SendMultiple) {
std::vector<char> data(5 * 1024 * 1024);
RandomizeBuffer(data.data(), data.size());
@@ -45,34 +105,20 @@ TEST(SendFileTest, SendMultiple) {
TempPath::kDefaultFileMode));
const TempPath out_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile());
- // Use a socket for target file to make the write window small.
- const FileDescriptor server(socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
- ASSERT_THAT(server.get(), SyscallSucceeds());
-
- struct sockaddr_in server_addr = {};
- server_addr.sin_family = AF_INET;
- server_addr.sin_addr.s_addr = INADDR_ANY;
- ASSERT_THAT(
- bind(server.get(), reinterpret_cast<struct sockaddr *>(&server_addr),
- sizeof(server_addr)),
- SyscallSucceeds());
- ASSERT_THAT(listen(server.get(), 1), SyscallSucceeds());
+ // Create sockets.
+ std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
+ const FileDescriptor server(std::get<0>(fds));
+ FileDescriptor client(std::get<1>(fds)); // non-const, reset is used.
// Thread that reads data from socket and dumps to a file.
- ScopedThread th([&server, &out_file, &server_addr] {
- socklen_t addrlen = sizeof(server_addr);
- const FileDescriptor fd(RetryEINTR(accept)(
- server.get(), reinterpret_cast<struct sockaddr *>(&server_addr),
- &addrlen));
- ASSERT_THAT(fd.get(), SyscallSucceeds());
-
+ ScopedThread th([&] {
FileDescriptor outf =
ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_WRONLY));
// Read until socket is closed.
char buf[10240];
for (int cnt = 0;; cnt++) {
- int r = RetryEINTR(read)(fd.get(), buf, sizeof(buf));
+ int r = RetryEINTR(read)(server.get(), buf, sizeof(buf));
// We cannot afford to save on every read() call.
if (cnt % 1000 == 0) {
ASSERT_THAT(r, SyscallSucceeds());
@@ -99,25 +145,6 @@ TEST(SendFileTest, SendMultiple) {
const FileDescriptor inf =
ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
- FileDescriptor outf(socket(AF_INET, SOCK_STREAM, IPPROTO_TCP));
- ASSERT_THAT(outf.get(), SyscallSucceeds());
-
- // Get the port bound by the listening socket.
- socklen_t addrlen = sizeof(server_addr);
- ASSERT_THAT(getsockname(server.get(),
- reinterpret_cast<sockaddr *>(&server_addr), &addrlen),
- SyscallSucceeds());
-
- struct sockaddr_in addr = {};
- addr.sin_family = AF_INET;
- addr.sin_addr.s_addr = inet_addr("127.0.0.1");
- addr.sin_port = server_addr.sin_port;
- std::cout << "Connecting on port=" << server_addr.sin_port;
- ASSERT_THAT(
- RetryEINTR(connect)(
- outf.get(), reinterpret_cast<struct sockaddr *>(&addr), sizeof(addr)),
- SyscallSucceeds());
-
int cnt = 0;
for (size_t sent = 0; sent < data.size(); cnt++) {
const size_t remain = data.size() - sent;
@@ -125,7 +152,7 @@ TEST(SendFileTest, SendMultiple) {
<< ", remain=" << remain;
// Send data and verify that sendfile returns the correct value.
- int res = sendfile(outf.get(), inf.get(), nullptr, remain);
+ int res = sendfile(client.get(), inf.get(), nullptr, remain);
// We cannot afford to save on every sendfile() call.
if (cnt % 120 == 0) {
MaybeSave();
@@ -142,17 +169,74 @@ TEST(SendFileTest, SendMultiple) {
}
// Close socket to stop thread.
- outf.reset();
+ client.reset();
th.Join();
// Verify that the output file has the correct data.
- outf = ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
+ const FileDescriptor outf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(out_file.path(), O_RDONLY));
std::vector<char> actual(data.size(), '\0');
ASSERT_THAT(RetryEINTR(read)(outf.get(), actual.data(), actual.size()),
SyscallSucceedsWithValue(actual.size()));
ASSERT_EQ(memcmp(data.data(), actual.data(), data.size()), 0);
}
+TEST_P(SendFileTest, Shutdown) {
+ // Create a socket.
+ std::tuple<int, int> fds = ASSERT_NO_ERRNO_AND_VALUE(Sockets());
+ const FileDescriptor client(std::get<0>(fds));
+ FileDescriptor server(std::get<1>(fds)); // non-const, released below.
+
+ // If this is a TCP socket, then turn off linger.
+ if (GetParam() == AF_INET) {
+ struct linger sl;
+ sl.l_onoff = 1;
+ sl.l_linger = 0;
+ ASSERT_THAT(
+ setsockopt(server.get(), SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)),
+ SyscallSucceeds());
+ }
+
+ // Create a 1m file with random data.
+ std::vector<char> data(1024 * 1024);
+ RandomizeBuffer(data.data(), data.size());
+ const TempPath in_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith(
+ GetAbsoluteTestTmpdir(), absl::string_view(data.data(), data.size()),
+ TempPath::kDefaultFileMode));
+ const FileDescriptor inf =
+ ASSERT_NO_ERRNO_AND_VALUE(Open(in_file.path(), O_RDONLY));
+
+ // Read some data, then shut down the socket. We don't actually care about
+ // checking the contents (other tests do that), so we just re-use the same
+ // buffer as above.
+ ScopedThread t([&]() {
+ int done = 0;
+ while (done < data.size()) {
+ int n = read(server.get(), data.data(), data.size());
+ ASSERT_THAT(n, SyscallSucceeds());
+ done += n;
+ }
+ // Close the server side socket.
+ ASSERT_THAT(close(server.release()), SyscallSucceeds());
+ });
+
+ // Continuously stream from the file to the socket. Note we do not assert
+ // that a specific amount of data has been written at any time, just that some
+ // data is written. Eventually, we should get a connection reset error.
+ while (1) {
+ off_t offset = 0; // Always read from the start.
+ int n = sendfile(client.get(), inf.get(), &offset, data.size());
+ EXPECT_THAT(n, AnyOf(SyscallFailsWithErrno(ECONNRESET),
+ SyscallFailsWithErrno(EPIPE), SyscallSucceeds()));
+ if (n <= 0) {
+ break;
+ }
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(AddressFamily, SendFileTest,
+ ::testing::Values(AF_UNIX, AF_INET));
+
} // namespace
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/socket_abstract.cc b/test/syscalls/linux/socket_abstract.cc
index 2faf678f7..715d87b76 100644
--- a/test/syscalls/linux/socket_abstract.cc
+++ b/test/syscalls/linux/socket_abstract.cc
@@ -17,6 +17,7 @@
#include "test/syscalls/linux/socket_generic.h"
#include "test/syscalls/linux/socket_test_util.h"
#include "test/syscalls/linux/socket_unix.h"
+#include "test/syscalls/linux/socket_unix_cmsg.h"
#include "test/syscalls/linux/unix_domain_socket_test_util.h"
#include "test/util/test_util.h"
@@ -31,11 +32,15 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, AllSocketPairTest,
+ AbstractUnixSockets, AllSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, UnixSocketPairTest,
+ AbstractUnixSockets, UnixSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ AbstractUnixSockets, UnixSocketPairCmsgTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_filesystem.cc b/test/syscalls/linux/socket_filesystem.cc
index f7cb72df4..74e262959 100644
--- a/test/syscalls/linux/socket_filesystem.cc
+++ b/test/syscalls/linux/socket_filesystem.cc
@@ -17,6 +17,7 @@
#include "test/syscalls/linux/socket_generic.h"
#include "test/syscalls/linux/socket_test_util.h"
#include "test/syscalls/linux/socket_unix.h"
+#include "test/syscalls/linux/socket_unix_cmsg.h"
#include "test/syscalls/linux/unix_domain_socket_test_util.h"
#include "test/util/test_util.h"
@@ -31,11 +32,15 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, AllSocketPairTest,
+ FilesystemUnixSockets, AllSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, UnixSocketPairTest,
+ FilesystemUnixSockets, UnixSocketPairTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
+INSTANTIATE_TEST_SUITE_P(
+ FilesystemUnixSockets, UnixSocketPairCmsgTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc
index b216d14cb..df31d25b5 100644
--- a/test/syscalls/linux/socket_inet_loopback.cc
+++ b/test/syscalls/linux/socket_inet_loopback.cc
@@ -14,6 +14,7 @@
#include <arpa/inet.h>
#include <netinet/in.h>
+#include <poll.h>
#include <string.h>
#include <sys/socket.h>
@@ -144,6 +145,66 @@ TEST_P(SocketInetLoopbackTest, TCP) {
ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds());
}
+TEST_P(SocketInetLoopbackTest, TCPbacklog) {
+ auto const& param = GetParam();
+
+ TestAddress const& listener = param.listener;
+ TestAddress const& connector = param.connector;
+
+ // Create the listening socket.
+ const FileDescriptor listen_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(listener.family(), SOCK_STREAM, IPPROTO_TCP));
+ sockaddr_storage listen_addr = listener.addr;
+ ASSERT_THAT(bind(listen_fd.get(), reinterpret_cast<sockaddr*>(&listen_addr),
+ listener.addr_len),
+ SyscallSucceeds());
+ ASSERT_THAT(listen(listen_fd.get(), 2), SyscallSucceeds());
+
+ // Get the port bound by the listening socket.
+ socklen_t addrlen = listener.addr_len;
+ ASSERT_THAT(getsockname(listen_fd.get(),
+ reinterpret_cast<sockaddr*>(&listen_addr), &addrlen),
+ SyscallSucceeds());
+ uint16_t const port =
+ ASSERT_NO_ERRNO_AND_VALUE(AddrPort(listener.family(), listen_addr));
+ int i = 0;
+ while (1) {
+ int ret;
+
+ // Connect to the listening socket.
+ const FileDescriptor conn_fd = ASSERT_NO_ERRNO_AND_VALUE(
+ Socket(connector.family(), SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP));
+ sockaddr_storage conn_addr = connector.addr;
+ ASSERT_NO_ERRNO(SetAddrPort(connector.family(), &conn_addr, port));
+ ret = connect(conn_fd.get(), reinterpret_cast<sockaddr*>(&conn_addr),
+ connector.addr_len);
+ if (ret != 0) {
+ EXPECT_THAT(ret, SyscallFailsWithErrno(EINPROGRESS));
+ struct pollfd pfd = {
+ .fd = conn_fd.get(),
+ .events = POLLOUT,
+ };
+ ret = poll(&pfd, 1, 3000);
+ if (ret == 0) break;
+ EXPECT_THAT(ret, SyscallSucceedsWithValue(1));
+ }
+ EXPECT_THAT(RetryEINTR(send)(conn_fd.get(), &i, sizeof(i), 0),
+ SyscallSucceedsWithValue(sizeof(i)));
+ ASSERT_THAT(shutdown(conn_fd.get(), SHUT_RDWR), SyscallSucceeds());
+ i++;
+ }
+
+ for (; i != 0; i--) {
+ // Accept the connection.
+ //
+ // We have to assign a name to the accepted socket, as unnamed temporary
+ // objects are destroyed upon full evaluation of the expression they appear
+ // in, potentially causing the connecting socket to fail to shut down
+ // properly.
+ auto accepted =
+ ASSERT_NO_ERRNO_AND_VALUE(Accept(listen_fd.get(), nullptr, nullptr));
+ }
+}
+
INSTANTIATE_TEST_SUITE_P(
All, SocketInetLoopbackTest,
::testing::Values(
diff --git a/test/syscalls/linux/socket_ip_loopback_blocking.cc b/test/syscalls/linux/socket_ip_loopback_blocking.cc
index d7fc20aad..d7fc9715b 100644
--- a/test/syscalls/linux/socket_ip_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_loopback_blocking.cc
@@ -39,7 +39,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, BlockingSocketPairTest,
+ BlockingIPSockets, BlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ip_tcp_generic.cc b/test/syscalls/linux/socket_ip_tcp_generic.cc
index 5b198f49d..0b76280a7 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic.cc
@@ -592,5 +592,109 @@ TEST_P(TCPSocketPairTest, MsgTruncMsgPeek) {
EXPECT_EQ(0, memcmp(received_data2, sent_data, sizeof(sent_data)));
}
+TEST_P(TCPSocketPairTest, SetCongestionControlSucceedsForSupported) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ // Netstack only supports reno & cubic so we only test these two values here.
+ {
+ const char kSetCC[kTcpCaNameMax] = "reno";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax];
+ memset(got_cc, '1', sizeof(got_cc));
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kSetCC)));
+ }
+ {
+ const char kSetCC[kTcpCaNameMax] = "cubic";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax];
+ memset(got_cc, '1', sizeof(got_cc));
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kSetCC)));
+ }
+}
+
+TEST_P(TCPSocketPairTest, SetGetTCPCongestionShortReadBuffer) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ {
+ // Verify that getsockopt/setsockopt work with buffers smaller than
+ // kTcpCaNameMax.
+ const char kSetCC[] = "cubic";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[sizeof(kSetCC)];
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(got_cc)));
+ }
+}
+
+TEST_P(TCPSocketPairTest, SetGetTCPCongestionLargeReadBuffer) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ {
+ // Verify that getsockopt works with buffers larger than
+ // kTcpCaNameMax.
+ const char kSetCC[] = "cubic";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax + 5];
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ // Linux copies the minimum of kTcpCaNameMax and the length of the passed-in
+ // buffer, and sets optlen to the number of bytes actually copied,
+ // irrespective of the actual length of the congestion control name.
+ EXPECT_EQ(kTcpCaNameMax, optlen);
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kSetCC)));
+ }
+}
+
+TEST_P(TCPSocketPairTest, SetCongestionControlFailsForUnsupported) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ char old_cc[kTcpCaNameMax];
+ socklen_t optlen = sizeof(old_cc);
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &old_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+
+ const char kSetCC[] = "invalid_ca_cc";
+ ASSERT_THAT(setsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &kSetCC, strlen(kSetCC)),
+ SyscallFailsWithErrno(ENOENT));
+
+ char got_cc[kTcpCaNameMax];
+ ASSERT_THAT(getsockopt(sockets->first_fd(), IPPROTO_TCP, TCP_CONGESTION,
+ &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(0, memcmp(got_cc, old_cc, sizeof(old_cc)));
+}
+
} // namespace testing
} // namespace gvisor
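
To make the truncation behaviour that SetGetTCPCongestionLargeReadBuffer relies on easier to see in isolation, here is a minimal sketch that reads TCP_CONGESTION into an oversized buffer. The hard-coded limit of 16 mirrors the kTcpCaNameMax constant used above and is an assumption about Linux's TCP_CA_NAME_MAX; error handling is reduced to perror.

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main() {
  const socklen_t kTcpCaNameMax = 16;  // Assumed value of TCP_CA_NAME_MAX.

  int fd = socket(AF_INET, SOCK_STREAM, 0);
  if (fd < 0) { perror("socket"); return 1; }

  // Ask for the current congestion control algorithm with a buffer that is
  // deliberately larger than the name limit.
  char cc[64] = {};
  socklen_t optlen = sizeof(cc);
  if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cc, &optlen) < 0) {
    perror("getsockopt(TCP_CONGESTION)");
  } else {
    // optlen comes back clamped to at most the name limit, not sizeof(cc).
    printf("cc=%s optlen=%u (expected <= %u)\n", cc, optlen, kTcpCaNameMax);
  }
  close(fd);
  return 0;
}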
diff --git a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
index 2c6ae17bf..0dc274e2d 100644
--- a/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
+++ b/test/syscalls/linux/socket_ip_tcp_generic_loopback.cc
@@ -35,7 +35,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, TCPSocketPairTest,
+ AllTCPSockets, TCPSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
index d1ea8ef12..cd3ad97d0 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_blocking.cc
@@ -35,7 +35,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, BlockingStreamSocketPairTest,
+ BlockingTCPSockets, BlockingStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
index 96c1b3b3d..1acdecc17 100644
--- a/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
+++ b/test/syscalls/linux/socket_ip_tcp_loopback_nonblock.cc
@@ -34,7 +34,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingSocketPairTest,
+ NonBlockingTCPSockets, NonBlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
index 251817a9f..de63f79d9 100644
--- a/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
+++ b/test/syscalls/linux/socket_ip_tcp_udp_generic.cc
@@ -69,7 +69,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllTCPSockets, TcpUdpSocketPairTest,
+ AllIPSockets, TcpUdpSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace
diff --git a/test/syscalls/linux/socket_ip_udp_loopback.cc b/test/syscalls/linux/socket_ip_udp_loopback.cc
index fc124e9ef..1df74a348 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback.cc
@@ -33,15 +33,15 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, AllSocketPairTest,
+ AllUDPSockets, AllSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonStreamSocketPairTest,
+ AllUDPSockets, NonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- UDPSockets, UDPSocketPairTest,
+ AllUDPSockets, UDPSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
index 1c3d1c0ad..1e259efa7 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback_blocking.cc
@@ -30,7 +30,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, BlockingNonStreamSocketPairTest,
+ BlockingUDPSockets, BlockingNonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
index 7554b08d5..74cbd326d 100644
--- a/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
+++ b/test/syscalls/linux/socket_ip_udp_loopback_nonblock.cc
@@ -30,7 +30,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingSocketPairTest,
+ NonBlockingUDPSockets, NonBlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
index 040bb176e..92f03e045 100644
--- a/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_tcp_unbound_external_networking_test.cc
@@ -28,7 +28,7 @@ std::vector<SocketKind> GetSockets() {
AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK}));
}
-INSTANTIATE_TEST_SUITE_P(IPv4TCPSockets,
+INSTANTIATE_TEST_SUITE_P(IPv4TCPUnboundSockets,
IPv4TCPUnboundExternalNetworkingSocketTest,
::testing::ValuesIn(GetSockets()));
} // namespace testing
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
index 53dcd58cd..6b92e05aa 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking.cc
@@ -559,5 +559,134 @@ TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
}
+// Check that two sockets can join the same multicast group at the same time,
+// and both will receive data on it.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest, TestSendMulticastToTwo) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ std::unique_ptr<FileDescriptor> receivers[2] = {
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket()),
+ ASSERT_NO_ERRNO_AND_VALUE(NewSocket())};
+
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ auto receiver_addr = V4Any();
+ int bound_port = 0;
+ for (auto& receiver : receivers) {
+ ASSERT_THAT(setsockopt(receiver->get(), SOL_SOCKET, SO_REUSEPORT,
+ &kSockOptOn, sizeof(kSockOptOn)),
+ SyscallSucceeds());
+ // Bind the receiver to the v4 any address to ensure that we can receive the
+ // multicast packet.
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+ // On the first iteration, save the port we are bound to. On the second
+ // iteration, verify the port is the same as the one from the first
+ // iteration. In other words, both sockets listen on the same port.
+ if (bound_port == 0) {
+ bound_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+ } else {
+ EXPECT_EQ(bound_port,
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port);
+ }
+
+ // Register to receive multicast packets.
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP,
+ &group, sizeof(group)),
+ SyscallSucceeds());
+ }
+
+ // Send a multicast packet to the group and verify both receivers get it.
+ auto send_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&send_addr.addr)->sin_port = bound_port;
+ char send_buf[200];
+ RandomizeBuffer(send_buf, sizeof(send_buf));
+ ASSERT_THAT(RetryEINTR(sendto)(sender->get(), send_buf, sizeof(send_buf), 0,
+ reinterpret_cast<sockaddr*>(&send_addr.addr),
+ send_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+ for (auto& receiver : receivers) {
+ char recv_buf[sizeof(send_buf)] = {};
+ ASSERT_THAT(
+ RetryEINTR(recv)(receiver->get(), recv_buf, sizeof(recv_buf), 0),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ EXPECT_EQ(0, memcmp(send_buf, recv_buf, sizeof(send_buf)));
+ }
+}
+
+// Check that when receiving a looped-back multicast packet, its source address
+// is not a multicast address.
+TEST_P(IPv4UDPUnboundExternalNetworkingSocketTest,
+ IpMulticastLoopbackFromAddr) {
+ auto sender = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+ auto receiver = ASSERT_NO_ERRNO_AND_VALUE(NewSocket());
+
+ auto receiver_addr = V4Any();
+ ASSERT_THAT(
+ bind(receiver->get(), reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ receiver_addr.addr_len),
+ SyscallSucceeds());
+ socklen_t receiver_addr_len = receiver_addr.addr_len;
+ ASSERT_THAT(getsockname(receiver->get(),
+ reinterpret_cast<sockaddr*>(&receiver_addr.addr),
+ &receiver_addr_len),
+ SyscallSucceeds());
+ EXPECT_EQ(receiver_addr_len, receiver_addr.addr_len);
+ int receiver_port =
+ reinterpret_cast<sockaddr_in*>(&receiver_addr.addr)->sin_port;
+
+ ip_mreq group = {};
+ group.imr_multiaddr.s_addr = inet_addr(kMulticastAddress);
+ ASSERT_THAT(setsockopt(receiver->get(), IPPROTO_IP, IP_ADD_MEMBERSHIP, &group,
+ sizeof(group)),
+ SyscallSucceeds());
+
+ // Connect to the multicast address. This binds us to the outgoing interface
+ // and allows us to get its IP (to be compared against the src-IP on the
+ // receiver side).
+ auto sendto_addr = V4Multicast();
+ reinterpret_cast<sockaddr_in*>(&sendto_addr.addr)->sin_port = receiver_port;
+ ASSERT_THAT(RetryEINTR(connect)(
+ sender->get(), reinterpret_cast<sockaddr*>(&sendto_addr.addr),
+ sendto_addr.addr_len),
+ SyscallSucceeds());
+ TestAddress sender_addr("");
+ ASSERT_THAT(
+ getsockname(sender->get(), reinterpret_cast<sockaddr*>(&sender_addr.addr),
+ &sender_addr.addr_len),
+ SyscallSucceeds());
+ ASSERT_EQ(sizeof(struct sockaddr_in), sender_addr.addr_len);
+ sockaddr_in* sender_addr_in =
+ reinterpret_cast<sockaddr_in*>(&sender_addr.addr);
+
+ // Send a multicast packet.
+ char send_buf[4] = {};
+ ASSERT_THAT(RetryEINTR(send)(sender->get(), send_buf, sizeof(send_buf), 0),
+ SyscallSucceedsWithValue(sizeof(send_buf)));
+
+ // Receive a multicast packet.
+ char recv_buf[sizeof(send_buf)] = {};
+ TestAddress src_addr("");
+ ASSERT_THAT(
+ RetryEINTR(recvfrom)(receiver->get(), recv_buf, sizeof(recv_buf), 0,
+ reinterpret_cast<sockaddr*>(&src_addr.addr),
+ &src_addr.addr_len),
+ SyscallSucceedsWithValue(sizeof(recv_buf)));
+ ASSERT_EQ(sizeof(struct sockaddr_in), src_addr.addr_len);
+ sockaddr_in* src_addr_in = reinterpret_cast<sockaddr_in*>(&src_addr.addr);
+
+ // Verify that the received source IP:port matches the sender one.
+ EXPECT_EQ(sender_addr_in->sin_port, src_addr_in->sin_port);
+ EXPECT_EQ(sender_addr_in->sin_addr.s_addr, src_addr_in->sin_addr.s_addr);
+}
+
} // namespace testing
} // namespace gvisor
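
The two multicast tests above both depend on the same receiver setup, so a compact standalone sketch of it may help: enable SO_REUSEPORT so a second socket can share the port, bind to the IPv4 any-address, then join the group with IP_ADD_MEMBERSHIP. The group address 224.0.2.1 and the kernel-chosen port are illustrative assumptions; the tests use their own kMulticastAddress constant and the V4Any()/V4Multicast() helpers.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main() {
  int fd = socket(AF_INET, SOCK_DGRAM, 0);
  if (fd < 0) { perror("socket"); return 1; }

  // Allow another receiver to bind the same address and port.
  int on = 1;
  if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)) < 0)
    perror("setsockopt(SO_REUSEPORT)");

  // Bind to the IPv4 any-address; port 0 lets the kernel pick one.
  struct sockaddr_in addr = {};
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = htonl(INADDR_ANY);
  addr.sin_port = htons(0);
  if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) perror("bind");

  // Join the (illustrative) multicast group on the default interface.
  struct ip_mreq group = {};
  group.imr_multiaddr.s_addr = inet_addr("224.0.2.1");
  group.imr_interface.s_addr = htonl(INADDR_ANY);
  if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &group, sizeof(group)) < 0)
    perror("setsockopt(IP_ADD_MEMBERSHIP)");

  close(fd);
  return 0;
}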
diff --git a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
index ffbb8e6eb..9d4e1ab97 100644
--- a/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
+++ b/test/syscalls/linux/socket_ipv4_udp_unbound_external_networking_test.cc
@@ -28,7 +28,7 @@ std::vector<SocketKind> GetSockets() {
AllBitwiseCombinations(List<int>{0, SOCK_NONBLOCK}));
}
-INSTANTIATE_TEST_SUITE_P(IPv4UDPSockets,
+INSTANTIATE_TEST_SUITE_P(IPv4UDPUnboundSockets,
IPv4UDPUnboundExternalNetworkingSocketTest,
::testing::ValuesIn(GetSockets()));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc
index 95cf8d2a3..875f0391f 100644
--- a/test/syscalls/linux/socket_unix.cc
+++ b/test/syscalls/linux/socket_unix.cc
@@ -32,6 +32,9 @@
#include "test/util/test_util.h"
#include "test/util/thread_util.h"
+// This file contains tests specific to Unix domain sockets. It does not contain
+// tests for UDS control messages. Those belong in socket_unix_cmsg.cc.
+//
// This file is a generic socket test file. It must be built with another file
// that provides the test types.
@@ -40,1430 +43,6 @@ namespace testing {
namespace {
-TEST_P(UnixSocketPairTest, BasicFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char received_data[20];
- int fd = -1;
- ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
- sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
-}
-
-TEST_P(UnixSocketPairTest, BasicTwoFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair1 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- auto pair2 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- int sent_fds[] = {pair1->second_fd(), pair2->second_fd()};
-
- ASSERT_NO_FATAL_FAILURE(
- SendFDs(sockets->first_fd(), sent_fds, 2, sent_data, sizeof(sent_data)));
-
- char received_data[20];
- int received_fds[] = {-1, -1};
-
- ASSERT_NO_FATAL_FAILURE(RecvFDs(sockets->second_fd(), received_fds, 2,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[0], pair1->first_fd()));
- ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[1], pair2->first_fd()));
-}
-
-TEST_P(UnixSocketPairTest, BasicThreeFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair1 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- auto pair2 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- auto pair3 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- int sent_fds[] = {pair1->second_fd(), pair2->second_fd(), pair3->second_fd()};
-
- ASSERT_NO_FATAL_FAILURE(
- SendFDs(sockets->first_fd(), sent_fds, 3, sent_data, sizeof(sent_data)));
-
- char received_data[20];
- int received_fds[] = {-1, -1, -1};
-
- ASSERT_NO_FATAL_FAILURE(RecvFDs(sockets->second_fd(), received_fds, 3,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[0], pair1->first_fd()));
- ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[1], pair2->first_fd()));
- ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[2], pair3->first_fd()));
-}
-
-TEST_P(UnixSocketPairTest, BadFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- int sent_fd = -1;
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(sizeof(sent_fd))];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- cmsg->cmsg_len = CMSG_LEN(sizeof(sent_fd));
- cmsg->cmsg_level = SOL_SOCKET;
- cmsg->cmsg_type = SCM_RIGHTS;
- memcpy(CMSG_DATA(cmsg), &sent_fd, sizeof(sent_fd));
-
- struct iovec iov;
- iov.iov_base = sent_data;
- iov.iov_len = sizeof(sent_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
- SyscallFailsWithErrno(EBADF));
-}
-
-// BasicFDPassNoSpace starts off by sending a single FD just like BasicFDPass.
-// The difference is that when calling recvmsg, no space for FDs is provided,
-// only space for the cmsg header.
-TEST_P(UnixSocketPairTest, BasicFDPassNoSpace) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char received_data[20];
-
- struct msghdr msg = {};
- std::vector<char> control(CMSG_SPACE(0));
- msg.msg_control = &control[0];
- msg.msg_controllen = control.size();
-
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_controllen, 0);
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-// BasicFDPassNoSpaceMsgCtrunc sends an FD, but does not provide any space to
-// receive it. It then verifies that the MSG_CTRUNC flag is set in the msghdr.
-TEST_P(UnixSocketPairTest, BasicFDPassNoSpaceMsgCtrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- std::vector<char> control(CMSG_SPACE(0));
- msg.msg_control = &control[0];
- msg.msg_controllen = control.size();
-
- char received_data[sizeof(sent_data)];
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_controllen, 0);
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
-}
-
-// BasicFDPassNullControlMsgCtrunc sends an FD and sets contradictory values for
-// msg_controllen and msg_control. msg_controllen is set to the correct size to
-// accomidate the FD, but msg_control is set to NULL. In this case, msg_control
-// should override msg_controllen.
-TEST_P(UnixSocketPairTest, BasicFDPassNullControlMsgCtrunc) {
- // FIXME(gvisor.dev/issue/207): Fix handling of NULL msg_control.
- SKIP_IF(IsRunningOnGvisor());
-
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- msg.msg_controllen = CMSG_SPACE(1);
-
- char received_data[sizeof(sent_data)];
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_controllen, 0);
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
-}
-
-// BasicFDPassNotEnoughSpaceMsgCtrunc sends an FD, but does not provide enough
-// space to receive it. It then verifies that the MSG_CTRUNC flag is set in the
-// msghdr.
-TEST_P(UnixSocketPairTest, BasicFDPassNotEnoughSpaceMsgCtrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- std::vector<char> control(CMSG_SPACE(0) + 1);
- msg.msg_control = &control[0];
- msg.msg_controllen = control.size();
-
- char received_data[sizeof(sent_data)];
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_controllen, 0);
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
-}
-
-// BasicThreeFDPassTruncationMsgCtrunc sends three FDs, but only provides enough
-// space to receive two of them. It then verifies that the MSG_CTRUNC flag is
-// set in the msghdr.
-TEST_P(UnixSocketPairTest, BasicThreeFDPassTruncationMsgCtrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair1 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- auto pair2 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- auto pair3 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- int sent_fds[] = {pair1->second_fd(), pair2->second_fd(), pair3->second_fd()};
-
- ASSERT_NO_FATAL_FAILURE(
- SendFDs(sockets->first_fd(), sent_fds, 3, sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- std::vector<char> control(CMSG_SPACE(2 * sizeof(int)));
- msg.msg_control = &control[0];
- msg.msg_controllen = control.size();
-
- char received_data[sizeof(sent_data)];
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(2 * sizeof(int)));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
-}
-
-// BasicFDPassUnalignedRecv starts off by sending a single FD just like
-// BasicFDPass. The difference is that when calling recvmsg, the length of the
-// receive data is only aligned on a 4 byte boundry instead of the normal 8.
-TEST_P(UnixSocketPairTest, BasicFDPassUnalignedRecv) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char received_data[20];
- int fd = -1;
- ASSERT_NO_FATAL_FAILURE(RecvSingleFDUnaligned(
- sockets->second_fd(), &fd, received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
-}
-
-// BasicFDPassUnalignedRecvNoMsgTrunc sends one FD and only provides enough
-// space to receive just it. (Normally the minimum amount of space one would
-// provide would be enough space for two FDs.) It then verifies that the
-// MSG_CTRUNC flag is not set in the msghdr.
-TEST_P(UnixSocketPairTest, BasicFDPassUnalignedRecvNoMsgTrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(sizeof(int)) - sizeof(int)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[sizeof(sent_data)] = {};
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_flags, 0);
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
-}
-
-// BasicTwoFDPassUnalignedRecvTruncationMsgTrunc sends two FDs, but only
-// provides enough space to receive one of them. It then verifies that the
-// MSG_CTRUNC flag is set in the msghdr.
-TEST_P(UnixSocketPairTest, BasicTwoFDPassUnalignedRecvTruncationMsgTrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- int sent_fds[] = {pair->first_fd(), pair->second_fd()};
-
- ASSERT_NO_FATAL_FAILURE(
- SendFDs(sockets->first_fd(), sent_fds, 2, sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- // CMSG_SPACE rounds up to two FDs, we only want one.
- char control[CMSG_SPACE(sizeof(int)) - sizeof(int)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[sizeof(sent_data)] = {};
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
-}
-
-TEST_P(UnixSocketPairTest, ConcurrentBasicFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- int sockfd1 = sockets->first_fd();
- auto recv_func = [sockfd1, sent_data]() {
- char received_data[20];
- int fd = -1;
- RecvSingleFD(sockfd1, &fd, received_data, sizeof(received_data));
- ASSERT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
- char buf[20];
- ASSERT_THAT(ReadFd(fd, buf, sizeof(buf)),
- SyscallSucceedsWithValue(sizeof(buf)));
- ASSERT_THAT(WriteFd(fd, buf, sizeof(buf)),
- SyscallSucceedsWithValue(sizeof(buf)));
- };
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->second_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- ScopedThread t(recv_func);
-
- RandomizeBuffer(sent_data, sizeof(sent_data));
- ASSERT_THAT(WriteFd(pair->first_fd(), sent_data, sizeof(sent_data)),
- SyscallSucceedsWithValue(sizeof(sent_data)));
-
- char received_data[20];
- ASSERT_THAT(ReadFd(pair->first_fd(), received_data, sizeof(received_data)),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- t.Join();
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-// FDPassNoRecv checks that the control message can be safely ignored by using
-// read(2) instead of recvmsg(2).
-TEST_P(UnixSocketPairTest, FDPassNoRecv) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- // Read while ignoring the passed FD.
- char received_data[20];
- ASSERT_THAT(
- ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- // Check that the socket still works for reads and writes.
- ASSERT_NO_FATAL_FAILURE(
- TransferTest(sockets->first_fd(), sockets->second_fd()));
-}
-
-// FDPassInterspersed1 checks that sent control messages cannot be read before
-// their associated data has been read.
-TEST_P(UnixSocketPairTest, FDPassInterspersed1) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char written_data[20];
- RandomizeBuffer(written_data, sizeof(written_data));
-
- ASSERT_THAT(WriteFd(sockets->first_fd(), written_data, sizeof(written_data)),
- SyscallSucceedsWithValue(sizeof(written_data)));
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- // Check that we don't get a control message, but do get the data.
- char received_data[20];
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data));
- EXPECT_EQ(0, memcmp(written_data, received_data, sizeof(written_data)));
-}
-
-// FDPassInterspersed2 checks that sent control messages cannot be read after
-// their assocated data has been read while ignoring the control message by
-// using read(2) instead of recvmsg(2).
-TEST_P(UnixSocketPairTest, FDPassInterspersed2) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char written_data[20];
- RandomizeBuffer(written_data, sizeof(written_data));
- ASSERT_THAT(WriteFd(sockets->first_fd(), written_data, sizeof(written_data)),
- SyscallSucceedsWithValue(sizeof(written_data)));
-
- char received_data[20];
- ASSERT_THAT(
- ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- ASSERT_NO_FATAL_FAILURE(
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
- EXPECT_EQ(0, memcmp(written_data, received_data, sizeof(written_data)));
-}
-
-TEST_P(UnixSocketPairTest, FDPassNotCoalesced) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data1[20];
- RandomizeBuffer(sent_data1, sizeof(sent_data1));
-
- auto pair1 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair1->second_fd(),
- sent_data1, sizeof(sent_data1)));
-
- char sent_data2[20];
- RandomizeBuffer(sent_data2, sizeof(sent_data2));
-
- auto pair2 =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair2->second_fd(),
- sent_data2, sizeof(sent_data2)));
-
- char received_data1[sizeof(sent_data1) + sizeof(sent_data2)];
- int received_fd1 = -1;
-
- RecvSingleFD(sockets->second_fd(), &received_fd1, received_data1,
- sizeof(received_data1), sizeof(sent_data1));
-
- EXPECT_EQ(0, memcmp(sent_data1, received_data1, sizeof(sent_data1)));
- TransferTest(pair1->first_fd(), pair1->second_fd());
-
- char received_data2[sizeof(sent_data1) + sizeof(sent_data2)];
- int received_fd2 = -1;
-
- RecvSingleFD(sockets->second_fd(), &received_fd2, received_data2,
- sizeof(received_data2), sizeof(sent_data2));
-
- EXPECT_EQ(0, memcmp(sent_data2, received_data2, sizeof(sent_data2)));
- TransferTest(pair2->first_fd(), pair2->second_fd());
-}
-
-TEST_P(UnixSocketPairTest, FDPassPeek) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char peek_data[20];
- int peek_fd = -1;
- PeekSingleFD(sockets->second_fd(), &peek_fd, peek_data, sizeof(peek_data));
- EXPECT_EQ(0, memcmp(sent_data, peek_data, sizeof(sent_data)));
- TransferTest(peek_fd, pair->first_fd());
- EXPECT_THAT(close(peek_fd), SyscallSucceeds());
-
- char received_data[20];
- int received_fd = -1;
- RecvSingleFD(sockets->second_fd(), &received_fd, received_data,
- sizeof(received_data));
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
- TransferTest(received_fd, pair->first_fd());
- EXPECT_THAT(close(received_fd), SyscallSucceeds());
-}
-
-TEST_P(UnixSocketPairTest, BasicCredPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- ASSERT_NO_FATAL_FAILURE(
- SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
- EXPECT_EQ(sent_creds.pid, received_creds.pid);
- EXPECT_EQ(sent_creds.uid, received_creds.uid);
- EXPECT_EQ(sent_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, SendNullCredsBeforeSoPassCredRecvEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- ASSERT_NO_FATAL_FAILURE(
- SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds {
- 0, 65534, 65534
- };
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, SendNullCredsAfterSoPassCredRecvEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- SetSoPassCred(sockets->second_fd());
-
- ASSERT_NO_FATAL_FAILURE(
- SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
-
- char received_data[20];
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds;
- ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, SendNullCredsBeforeSoPassCredSendEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- ASSERT_NO_FATAL_FAILURE(
- SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->first_fd());
-
- char received_data[20];
- ASSERT_NO_FATAL_FAILURE(
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-TEST_P(UnixSocketPairTest, SendNullCredsAfterSoPassCredSendEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- SetSoPassCred(sockets->first_fd());
-
- ASSERT_NO_FATAL_FAILURE(
- SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
-
- char received_data[20];
- ASSERT_NO_FATAL_FAILURE(
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-TEST_P(UnixSocketPairTest, SendNullCredsBeforeSoPassCredRecvEndAfterSendEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- SetSoPassCred(sockets->first_fd());
-
- ASSERT_NO_FATAL_FAILURE(
- SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds;
- ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, WriteBeforeSoPassCredRecvEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
- SyscallSucceedsWithValue(sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
-
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds {
- 0, 65534, 65534
- };
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, WriteAfterSoPassCredRecvEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- SetSoPassCred(sockets->second_fd());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
- ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
- SyscallSucceedsWithValue(sizeof(sent_data)));
-
- char received_data[20];
-
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds;
- ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, WriteBeforeSoPassCredSendEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
- SyscallSucceedsWithValue(sizeof(sent_data)));
-
- SetSoPassCred(sockets->first_fd());
-
- char received_data[20];
- ASSERT_NO_FATAL_FAILURE(
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-TEST_P(UnixSocketPairTest, WriteAfterSoPassCredSendEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- SetSoPassCred(sockets->first_fd());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
- SyscallSucceedsWithValue(sizeof(sent_data)));
-
- char received_data[20];
- ASSERT_NO_FATAL_FAILURE(
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-TEST_P(UnixSocketPairTest, WriteBeforeSoPassCredRecvEndAfterSendEnd) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- SetSoPassCred(sockets->first_fd());
-
- ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
- SyscallSucceedsWithValue(sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
-
- struct ucred received_creds;
- ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
- received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds;
- ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-}
-
-TEST_P(UnixSocketPairTest, CredPassTruncated) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- ASSERT_NO_FATAL_FAILURE(
- SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(0) + sizeof(pid_t)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[sizeof(sent_data)] = {};
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- EXPECT_EQ(msg.msg_controllen, sizeof(control));
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
-
- pid_t pid = 0;
- memcpy(&pid, CMSG_DATA(cmsg), sizeof(pid));
- EXPECT_EQ(pid, sent_creds.pid);
-}
-
-// CredPassNoMsgCtrunc passes a full set of credentials. It then verifies that
-// receiving the full set does not result in MSG_CTRUNC being set in the msghdr.
-TEST_P(UnixSocketPairTest, CredPassNoMsgCtrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- ASSERT_NO_FATAL_FAILURE(
- SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(sizeof(struct ucred))];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[sizeof(sent_data)] = {};
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- // The control message should not be truncated.
- EXPECT_EQ(msg.msg_flags, 0);
- EXPECT_EQ(msg.msg_controllen, sizeof(control));
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct ucred)));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
-}
-
-// CredPassNoSpaceMsgCtrunc passes a full set of credentials. It then receives
-// the data without providing space for any credentials and verifies that
-// MSG_CTRUNC is set in the msghdr.
-TEST_P(UnixSocketPairTest, CredPassNoSpaceMsgCtrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- ASSERT_NO_FATAL_FAILURE(
- SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(0)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[sizeof(sent_data)] = {};
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- // The control message should be truncated.
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
- EXPECT_EQ(msg.msg_controllen, sizeof(control));
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
-}
-
-// CredPassTruncatedMsgCtrunc passes a full set of credentials. It then receives
-// the data while providing enough space for only the first field of the
-// credentials and verifies that MSG_CTRUNC is set in the msghdr.
-TEST_P(UnixSocketPairTest, CredPassTruncatedMsgCtrunc) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- ASSERT_NO_FATAL_FAILURE(
- SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(0) + sizeof(pid_t)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[sizeof(sent_data)] = {};
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- // The control message should be truncated.
- EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
- EXPECT_EQ(msg.msg_controllen, sizeof(control));
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
-}
-
-TEST_P(UnixSocketPairTest, SoPassCred) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- int opt;
- socklen_t optLen = sizeof(opt);
- EXPECT_THAT(
- getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
- SyscallSucceeds());
- EXPECT_FALSE(opt);
-
- optLen = sizeof(opt);
- EXPECT_THAT(
- getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
- SyscallSucceeds());
- EXPECT_FALSE(opt);
-
- SetSoPassCred(sockets->first_fd());
-
- optLen = sizeof(opt);
- EXPECT_THAT(
- getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
- SyscallSucceeds());
- EXPECT_TRUE(opt);
-
- optLen = sizeof(opt);
- EXPECT_THAT(
- getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
- SyscallSucceeds());
- EXPECT_FALSE(opt);
-
- int zero = 0;
- EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &zero,
- sizeof(zero)),
- SyscallSucceeds());
-
- optLen = sizeof(opt);
- EXPECT_THAT(
- getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
- SyscallSucceeds());
- EXPECT_FALSE(opt);
-
- optLen = sizeof(opt);
- EXPECT_THAT(
- getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
- SyscallSucceeds());
- EXPECT_FALSE(opt);
-}
-
-TEST_P(UnixSocketPairTest, NoDataCredPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct msghdr msg = {};
-
- struct iovec iov;
- iov.iov_base = sent_data;
- iov.iov_len = sizeof(sent_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- char control[CMSG_SPACE(0)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- cmsg->cmsg_level = SOL_SOCKET;
- cmsg->cmsg_type = SCM_CREDENTIALS;
- cmsg->cmsg_len = CMSG_LEN(0);
-
- ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
- SyscallFailsWithErrno(EINVAL));
-}
-
-TEST_P(UnixSocketPairTest, NoPassCred) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- ASSERT_NO_FATAL_FAILURE(
- SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
-
- char received_data[20];
-
- ASSERT_NO_FATAL_FAILURE(
- RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-}
-
-TEST_P(UnixSocketPairTest, CredAndFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- struct ucred sent_creds;
-
- ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendCredsAndFD(sockets->first_fd(), sent_creds,
- pair->second_fd(), sent_data,
- sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
- struct ucred received_creds;
- int fd = -1;
- ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
- &fd, received_data,
- sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- EXPECT_EQ(sent_creds.pid, received_creds.pid);
- EXPECT_EQ(sent_creds.uid, received_creds.uid);
- EXPECT_EQ(sent_creds.gid, received_creds.gid);
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
-}
-
-TEST_P(UnixSocketPairTest, FDPassBeforeSoPassCred) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- SetSoPassCred(sockets->second_fd());
-
- char received_data[20];
- struct ucred received_creds;
- int fd = -1;
- ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
- &fd, received_data,
- sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds {
- 0, 65534, 65534
- };
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
-}
-
-TEST_P(UnixSocketPairTest, FDPassAfterSoPassCred) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- SetSoPassCred(sockets->second_fd());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char received_data[20];
- struct ucred received_creds;
- int fd = -1;
- ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
- &fd, received_data,
- sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- struct ucred want_creds;
- ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
- ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
-
- EXPECT_EQ(want_creds.pid, received_creds.pid);
- EXPECT_EQ(want_creds.uid, received_creds.uid);
- EXPECT_EQ(want_creds.gid, received_creds.gid);
-
- ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
-}
-
-TEST_P(UnixSocketPairTest, CloexecDroppedWhenFDPassed) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair = ASSERT_NO_ERRNO_AND_VALUE(
- UnixDomainSocketPair(SOCK_SEQPACKET | SOCK_CLOEXEC).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- char received_data[20];
- int fd = -1;
- ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
- sizeof(received_data)));
-
- EXPECT_THAT(fcntl(fd, F_GETFD), SyscallSucceedsWithValue(0));
-}
-
-TEST_P(UnixSocketPairTest, CloexecRecvFDPass) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(sizeof(int))];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- struct iovec iov;
- char received_data[20];
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CMSG_CLOEXEC),
- SyscallSucceedsWithValue(sizeof(received_data)));
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
- ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
-
- int fd = -1;
- memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
-
- EXPECT_THAT(fcntl(fd, F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
-}
-
-TEST_P(UnixSocketPairTest, FDPassAfterSoPassCredWithoutCredSpace) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- SetSoPassCred(sockets->second_fd());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- char control[CMSG_LEN(0)];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[20];
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
-
- EXPECT_EQ(msg.msg_controllen, sizeof(control));
-
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
- EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
-}
-
-// This test will validate that MSG_CTRUNC as an input flag to recvmsg will
-// not appear as an output flag on the control message when truncation doesn't
-// happen.
-TEST_P(UnixSocketPairTest, MsgCtruncInputIsNoop) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- char control[CMSG_SPACE(sizeof(int)) /* we're passing a single fd */];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- struct iovec iov;
- char received_data[20];
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CTRUNC),
- SyscallSucceedsWithValue(sizeof(received_data)));
- struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
- ASSERT_NE(cmsg, nullptr);
- ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
- ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
- ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
-
- // Now we should verify that MSG_CTRUNC wasn't set as an output flag.
- EXPECT_EQ(msg.msg_flags & MSG_CTRUNC, 0);
-}
-
-TEST_P(UnixSocketPairTest, FDPassAfterSoPassCredWithoutCredHeaderSpace) {
- auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
-
- char sent_data[20];
- RandomizeBuffer(sent_data, sizeof(sent_data));
-
- auto pair =
- ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
-
- SetSoPassCred(sockets->second_fd());
-
- ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
- sent_data, sizeof(sent_data)));
-
- struct msghdr msg = {};
- char control[CMSG_LEN(0) / 2];
- msg.msg_control = control;
- msg.msg_controllen = sizeof(control);
-
- char received_data[20];
- struct iovec iov;
- iov.iov_base = received_data;
- iov.iov_len = sizeof(received_data);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
- SyscallSucceedsWithValue(sizeof(received_data)));
-
- EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
- EXPECT_EQ(msg.msg_controllen, 0);
-}
-
TEST_P(UnixSocketPairTest, InvalidGetSockOpt) {
auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
int opt;
@@ -1519,6 +98,14 @@ TEST_P(UnixSocketPairTest, RecvmmsgTimeoutAfterRecv) {
TEST_P(UnixSocketPairTest, TIOCINQSucceeds) {
auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ if (IsRunningOnGvisor()) {
+ // TODO(gvisor.dev/issue/273): Inherited host UDS don't support TIOCINQ.
+ // Skip the test.
+ int size = -1;
+ int ret = ioctl(sockets->first_fd(), TIOCINQ, &size);
+ SKIP_IF(ret == -1 && errno == ENOTTY);
+ }
+
int size = -1;
EXPECT_THAT(ioctl(sockets->first_fd(), TIOCINQ, &size), SyscallSucceeds());
EXPECT_EQ(size, 0);
@@ -1544,6 +131,14 @@ TEST_P(UnixSocketPairTest, TIOCINQSucceeds) {
TEST_P(UnixSocketPairTest, TIOCOUTQSucceeds) {
auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+ if (IsRunningOnGvisor()) {
+ // TODO(gvisor.dev/issue/273): Inherited host UDS don't support TIOCOUTQ.
+ // Skip the test.
+ int size = -1;
+ int ret = ioctl(sockets->second_fd(), TIOCOUTQ, &size);
+ SKIP_IF(ret == -1 && errno == ENOTTY);
+ }
+
int size = -1;
EXPECT_THAT(ioctl(sockets->second_fd(), TIOCOUTQ, &size), SyscallSucceeds());
EXPECT_EQ(size, 0);
@@ -1580,19 +175,70 @@ TEST_P(UnixSocketPairTest, NetdeviceIoctlsSucceed) {
}
}
-TEST_P(UnixSocketPairTest, SocketShutdown) {
+TEST_P(UnixSocketPairTest, Shutdown) {
auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
- char buf[20];
+
const std::string data = "abc";
- ASSERT_THAT(WriteFd(sockets->first_fd(), data.c_str(), 3),
- SyscallSucceedsWithValue(3));
+ ASSERT_THAT(WriteFd(sockets->first_fd(), data.c_str(), data.size()),
+ SyscallSucceedsWithValue(data.size()));
+
ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_RDWR), SyscallSucceeds());
ASSERT_THAT(shutdown(sockets->second_fd(), SHUT_RDWR), SyscallSucceeds());
// Shutting down a socket does not clear the buffer.
- ASSERT_THAT(ReadFd(sockets->second_fd(), buf, 3),
- SyscallSucceedsWithValue(3));
- EXPECT_EQ(data, absl::string_view(buf, 3));
+ char buf[3];
+ ASSERT_THAT(ReadFd(sockets->second_fd(), buf, data.size()),
+ SyscallSucceedsWithValue(data.size()));
+ EXPECT_EQ(data, absl::string_view(buf, data.size()));
+}
+
+TEST_P(UnixSocketPairTest, ShutdownRead) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_RD), SyscallSucceeds());
+
+  // When the socket is shut down for read, read behavior varies between
+ // different socket types. This is covered by the various ReadOneSideClosed
+ // test cases.
+
+ // ... and the peer cannot write.
+ const std::string data = "abc";
+ EXPECT_THAT(WriteFd(sockets->second_fd(), data.c_str(), data.size()),
+ SyscallFailsWithErrno(EPIPE));
+
+ // ... but the socket can still write.
+ ASSERT_THAT(WriteFd(sockets->first_fd(), data.c_str(), data.size()),
+ SyscallSucceedsWithValue(data.size()));
+
+ // ... and the peer can still read.
+ char buf[3];
+ EXPECT_THAT(ReadFd(sockets->second_fd(), buf, data.size()),
+ SyscallSucceedsWithValue(data.size()));
+ EXPECT_EQ(data, absl::string_view(buf, data.size()));
+}
+
+TEST_P(UnixSocketPairTest, ShutdownWrite) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(shutdown(sockets->first_fd(), SHUT_WR), SyscallSucceeds());
+
+  // When the socket is shut down for write, it cannot write.
+ const std::string data = "abc";
+ EXPECT_THAT(WriteFd(sockets->first_fd(), data.c_str(), data.size()),
+ SyscallFailsWithErrno(EPIPE));
+
+ // ... and the peer read behavior varies between different socket types. This
+ // is covered by the various ReadOneSideClosed test cases.
+
+ // ... but the peer can still write.
+ char buf[3];
+ ASSERT_THAT(WriteFd(sockets->second_fd(), data.c_str(), data.size()),
+ SyscallSucceedsWithValue(data.size()));
+
+ // ... and the socket can still read.
+ EXPECT_THAT(ReadFd(sockets->first_fd(), buf, data.size()),
+ SyscallSucceedsWithValue(data.size()));
+ EXPECT_EQ(data, absl::string_view(buf, data.size()));
}
TEST_P(UnixSocketPairTest, SocketReopenFromProcfs) {
diff --git a/test/syscalls/linux/socket_unix_abstract_nonblock.cc b/test/syscalls/linux/socket_unix_abstract_nonblock.cc
index 9de0f6dfe..be31ab2a7 100644
--- a/test/syscalls/linux/socket_unix_abstract_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_abstract_nonblock.cc
@@ -30,7 +30,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingSocketPairTest,
+ NonBlockingAbstractUnixSockets, NonBlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_blocking_local.cc b/test/syscalls/linux/socket_unix_blocking_local.cc
index 320915b0f..1994139e6 100644
--- a/test/syscalls/linux/socket_unix_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_blocking_local.cc
@@ -37,7 +37,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, BlockingSocketPairTest,
+ NonBlockingUnixDomainSockets, BlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_cmsg.cc b/test/syscalls/linux/socket_unix_cmsg.cc
new file mode 100644
index 000000000..b0ab26847
--- /dev/null
+++ b/test/syscalls/linux/socket_unix_cmsg.cc
@@ -0,0 +1,1473 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/syscalls/linux/socket_unix_cmsg.h"
+
+#include <errno.h>
+#include <net/if.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/un.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+#include "test/syscalls/linux/socket_test_util.h"
+#include "test/syscalls/linux/unix_domain_socket_test_util.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+// This file contains tests for control messages in Unix domain sockets.
+//
+// This file is a generic socket test file. It must be built with another file
+// that provides the test types.
+
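+// A separate driver file supplies the socket pair kinds and instantiates the
+// suite. A minimal sketch of such a driver, mirroring the pattern used by
+// socket_unix_abstract_nonblock.cc above (the file name below is hypothetical;
+// UnixDomainSocketPair and IncludeReversals come from the test utilities):
+//
+//   // socket_unix_cmsg_seqpacket.cc (illustrative only)
+//   std::vector<SocketPairKind> GetSocketPairs() {
+//     return {UnixDomainSocketPair(SOCK_SEQPACKET)};
+//   }
+//
+//   INSTANTIATE_TEST_SUITE_P(
+//       AllUnixDomainSockets, UnixSocketPairCmsgTest,
+//       ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+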
+namespace gvisor {
+namespace testing {
+
+namespace {
+
+TEST_P(UnixSocketPairCmsgTest, BasicFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, BasicTwoFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int sent_fds[] = {pair1->second_fd(), pair2->second_fd()};
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendFDs(sockets->first_fd(), sent_fds, 2, sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int received_fds[] = {-1, -1};
+
+ ASSERT_NO_FATAL_FAILURE(RecvFDs(sockets->second_fd(), received_fds, 2,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[0], pair1->first_fd()));
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[1], pair2->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, BasicThreeFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair3 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int sent_fds[] = {pair1->second_fd(), pair2->second_fd(), pair3->second_fd()};
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendFDs(sockets->first_fd(), sent_fds, 3, sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int received_fds[] = {-1, -1, -1};
+
+ ASSERT_NO_FATAL_FAILURE(RecvFDs(sockets->second_fd(), received_fds, 3,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[0], pair1->first_fd()));
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[1], pair2->first_fd()));
+ ASSERT_NO_FATAL_FAILURE(TransferTest(received_fds[2], pair3->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, BadFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ int sent_fd = -1;
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(sent_fd))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_len = CMSG_LEN(sizeof(sent_fd));
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), &sent_fd, sizeof(sent_fd));
+
+ struct iovec iov;
+ iov.iov_base = sent_data;
+ iov.iov_len = sizeof(sent_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
+ SyscallFailsWithErrno(EBADF));
+}
+
+// BasicFDPassNoSpace starts off by sending a single FD just like BasicFDPass.
+// The difference is that when calling recvmsg, no space for FDs is provided,
+// only space for the cmsg header.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassNoSpace) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(0));
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+// BasicFDPassNoSpaceMsgCtrunc sends an FD, but does not provide any space to
+// receive it. It then verifies that the MSG_CTRUNC flag is set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassNoSpaceMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(0));
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ char received_data[sizeof(sent_data)];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+}
+
+// BasicFDPassNullControlMsgCtrunc sends an FD and sets contradictory values for
+// msg_controllen and msg_control. msg_controllen is set to the correct size to
+// accommodate the FD, but msg_control is set to NULL. In this case, msg_control
+// should override msg_controllen.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassNullControlMsgCtrunc) {
+ // FIXME(gvisor.dev/issue/207): Fix handling of NULL msg_control.
+ SKIP_IF(IsRunningOnGvisor());
+
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ msg.msg_controllen = CMSG_SPACE(1);
+
+ char received_data[sizeof(sent_data)];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+}
+
+// BasicFDPassNotEnoughSpaceMsgCtrunc sends an FD, but does not provide enough
+// space to receive it. It then verifies that the MSG_CTRUNC flag is set in the
+// msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassNotEnoughSpaceMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(0) + 1);
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ char received_data[sizeof(sent_data)];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_controllen, 0);
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+}
+
+// BasicThreeFDPassTruncationMsgCtrunc sends three FDs, but only provides enough
+// space to receive two of them. It then verifies that the MSG_CTRUNC flag is
+// set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicThreeFDPassTruncationMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ auto pair3 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int sent_fds[] = {pair1->second_fd(), pair2->second_fd(), pair3->second_fd()};
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendFDs(sockets->first_fd(), sent_fds, 3, sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ std::vector<char> control(CMSG_SPACE(2 * sizeof(int)));
+ msg.msg_control = &control[0];
+ msg.msg_controllen = control.size();
+
+ char received_data[sizeof(sent_data)];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(2 * sizeof(int)));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+}
+
+// BasicFDPassUnalignedRecv starts off by sending a single FD just like
+// BasicFDPass. The difference is that when calling recvmsg, the length of the
+// receive data is only aligned on a 4-byte boundary instead of the normal 8.
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassUnalignedRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFDUnaligned(
+ sockets->second_fd(), &fd, received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+// BasicFDPassUnalignedRecvNoMsgTrunc sends one FD and only provides enough
+// space to receive just it. (Normally the minimum amount of space one would
+// provide would be enough space for two FDs.) It then verifies that the
+// MSG_CTRUNC flag is not set in the msghdr.
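+//
+// The "unaligned" size comes from CMSG_SPACE rounding. On a typical LP64 Linux
+// target (an assumption; the exact numbers are ABI-dependent):
+//
+//   sizeof(struct cmsghdr)                 == 16
+//   CMSG_SPACE(sizeof(int))                == 16 + CMSG_ALIGN(4) == 24  // fits two ints
+//   CMSG_SPACE(sizeof(int)) - sizeof(int)  == 20                        // header + one int
+//
+// so the control buffer declared below holds the passed FD and nothing more.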
+TEST_P(UnixSocketPairCmsgTest, BasicFDPassUnalignedRecvNoMsgTrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(int)) - sizeof(int)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_flags, 0);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+}
+
+// BasicTwoFDPassUnalignedRecvTruncationMsgTrunc sends two FDs, but only
+// provides enough space to receive one of them. It then verifies that the
+// MSG_CTRUNC flag is set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, BasicTwoFDPassUnalignedRecvTruncationMsgTrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ int sent_fds[] = {pair->first_fd(), pair->second_fd()};
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendFDs(sockets->first_fd(), sent_fds, 2, sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ // CMSG_SPACE rounds up to two FDs, we only want one.
+ char control[CMSG_SPACE(sizeof(int)) - sizeof(int)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+}
+
+TEST_P(UnixSocketPairCmsgTest, ConcurrentBasicFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ int sockfd1 = sockets->first_fd();
+ auto recv_func = [sockfd1, sent_data]() {
+ char received_data[20];
+ int fd = -1;
+ RecvSingleFD(sockfd1, &fd, received_data, sizeof(received_data));
+ ASSERT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+ char buf[20];
+ ASSERT_THAT(ReadFd(fd, buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ ASSERT_THAT(WriteFd(fd, buf, sizeof(buf)),
+ SyscallSucceedsWithValue(sizeof(buf)));
+ };
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->second_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ ScopedThread t(recv_func);
+
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(WriteFd(pair->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[20];
+ ASSERT_THAT(ReadFd(pair->first_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ t.Join();
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+// FDPassNoRecv checks that the control message can be safely ignored by using
+// read(2) instead of recvmsg(2).
+TEST_P(UnixSocketPairCmsgTest, FDPassNoRecv) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ // Read while ignoring the passed FD.
+ char received_data[20];
+ ASSERT_THAT(
+ ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // Check that the socket still works for reads and writes.
+ ASSERT_NO_FATAL_FAILURE(
+ TransferTest(sockets->first_fd(), sockets->second_fd()));
+}
+
+// FDPassInterspersed1 checks that sent control messages cannot be read before
+// their associated data has been read.
+TEST_P(UnixSocketPairCmsgTest, FDPassInterspersed1) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char written_data[20];
+ RandomizeBuffer(written_data, sizeof(written_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), written_data, sizeof(written_data)),
+ SyscallSucceedsWithValue(sizeof(written_data)));
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ // Check that we don't get a control message, but do get the data.
+ char received_data[20];
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data));
+ EXPECT_EQ(0, memcmp(written_data, received_data, sizeof(written_data)));
+}
+
+// FDPassInterspersed2 checks that sent control messages cannot be read after
+// their associated data has been read while ignoring the control message by
+// using read(2) instead of recvmsg(2).
+TEST_P(UnixSocketPairCmsgTest, FDPassInterspersed2) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char written_data[20];
+ RandomizeBuffer(written_data, sizeof(written_data));
+ ASSERT_THAT(WriteFd(sockets->first_fd(), written_data, sizeof(written_data)),
+ SyscallSucceedsWithValue(sizeof(written_data)));
+
+ char received_data[20];
+ ASSERT_THAT(
+ ReadFd(sockets->second_fd(), received_data, sizeof(received_data)),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+ EXPECT_EQ(0, memcmp(written_data, received_data, sizeof(written_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassNotCoalesced) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data1[20];
+ RandomizeBuffer(sent_data1, sizeof(sent_data1));
+
+ auto pair1 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair1->second_fd(),
+ sent_data1, sizeof(sent_data1)));
+
+ char sent_data2[20];
+ RandomizeBuffer(sent_data2, sizeof(sent_data2));
+
+ auto pair2 =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair2->second_fd(),
+ sent_data2, sizeof(sent_data2)));
+
+ char received_data1[sizeof(sent_data1) + sizeof(sent_data2)];
+ int received_fd1 = -1;
+
+ RecvSingleFD(sockets->second_fd(), &received_fd1, received_data1,
+ sizeof(received_data1), sizeof(sent_data1));
+
+ EXPECT_EQ(0, memcmp(sent_data1, received_data1, sizeof(sent_data1)));
+ TransferTest(pair1->first_fd(), pair1->second_fd());
+
+ char received_data2[sizeof(sent_data1) + sizeof(sent_data2)];
+ int received_fd2 = -1;
+
+ RecvSingleFD(sockets->second_fd(), &received_fd2, received_data2,
+ sizeof(received_data2), sizeof(sent_data2));
+
+ EXPECT_EQ(0, memcmp(sent_data2, received_data2, sizeof(sent_data2)));
+ TransferTest(pair2->first_fd(), pair2->second_fd());
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassPeek) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char peek_data[20];
+ int peek_fd = -1;
+ PeekSingleFD(sockets->second_fd(), &peek_fd, peek_data, sizeof(peek_data));
+ EXPECT_EQ(0, memcmp(sent_data, peek_data, sizeof(sent_data)));
+ TransferTest(peek_fd, pair->first_fd());
+ EXPECT_THAT(close(peek_fd), SyscallSucceeds());
+
+ char received_data[20];
+ int received_fd = -1;
+ RecvSingleFD(sockets->second_fd(), &received_fd, received_data,
+ sizeof(received_data));
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+ TransferTest(received_fd, pair->first_fd());
+ EXPECT_THAT(close(received_fd), SyscallSucceeds());
+}
+
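+// The next group of tests exercises SCM_CREDENTIALS. On Linux the receiver
+// only sees credentials once SO_PASSCRED is enabled, and an unprivileged
+// sender may only claim its own pid/uid/gid. The SendCreds/RecvCreds helpers
+// come from unix_domain_socket_test_util.h; a sender-side sketch of the cmsg
+// they presumably build (illustrative only, not the actual helper):
+//
+//   struct msghdr msg = {};
+//   struct iovec iov = {data, data_size};
+//   msg.msg_iov = &iov;
+//   msg.msg_iovlen = 1;
+//
+//   char control[CMSG_SPACE(sizeof(struct ucred))];
+//   msg.msg_control = control;
+//   msg.msg_controllen = sizeof(control);
+//
+//   struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+//   cmsg->cmsg_level = SOL_SOCKET;
+//   cmsg->cmsg_type = SCM_CREDENTIALS;
+//   cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+//   memcpy(CMSG_DATA(cmsg), &creds, sizeof(creds));
+//
+//   RetryEINTR(sendmsg)(sock, &msg, 0);
+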
+TEST_P(UnixSocketPairCmsgTest, BasicCredPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+ EXPECT_EQ(sent_creds.pid, received_creds.pid);
+ EXPECT_EQ(sent_creds.uid, received_creds.uid);
+ EXPECT_EQ(sent_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, SendNullCredsBeforeSoPassCredRecvEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
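+  // No credentials were attached before SO_PASSCRED was enabled, so Linux
+  // reports pid 0 and the overflow uid/gid (65534 by default, i.e. nobody).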
+ struct ucred want_creds {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, SendNullCredsAfterSoPassCredRecvEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ SetSoPassCred(sockets->second_fd());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, SendNullCredsBeforeSoPassCredSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->first_fd());
+
+ char received_data[20];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, SendNullCredsAfterSoPassCredSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ SetSoPassCred(sockets->first_fd());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest,
+ SendNullCredsBeforeSoPassCredRecvEndAfterSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ SetSoPassCred(sockets->first_fd());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendNullCmsg(sockets->first_fd(), sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteBeforeSoPassCredRecvEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteAfterSoPassCredRecvEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ SetSoPassCred(sockets->second_fd());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[20];
+
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteBeforeSoPassCredSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ SetSoPassCred(sockets->first_fd());
+
+ char received_data[20];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteAfterSoPassCredSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ SetSoPassCred(sockets->first_fd());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ char received_data[20];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, WriteBeforeSoPassCredRecvEndAfterSendEnd) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ SetSoPassCred(sockets->first_fd());
+
+ ASSERT_THAT(WriteFd(sockets->first_fd(), sent_data, sizeof(sent_data)),
+ SyscallSucceedsWithValue(sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+
+ struct ucred received_creds;
+ ASSERT_NO_FATAL_FAILURE(RecvCreds(sockets->second_fd(), &received_creds,
+ received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+}
+
+TEST_P(UnixSocketPairCmsgTest, CredPassTruncated) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ struct msghdr msg = {};
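+  // Provide room for the cmsg header plus only the pid field, so the uid and
+  // gid of the passed credentials are truncated away.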
+ char control[CMSG_SPACE(0) + sizeof(pid_t)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+
+ pid_t pid = 0;
+ memcpy(&pid, CMSG_DATA(cmsg), sizeof(pid));
+ EXPECT_EQ(pid, sent_creds.pid);
+}
+
+// CredPassNoMsgCtrunc passes a full set of credentials. It then verifies that
+// receiving the full set does not result in MSG_CTRUNC being set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, CredPassNoMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(struct ucred))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // The control message should not be truncated.
+ EXPECT_EQ(msg.msg_flags, 0);
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(struct ucred)));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+}
+
+// CredPassNoSpaceMsgCtrunc passes a full set of credentials. It then receives
+// the data without providing space for any credentials and verifies that
+// MSG_CTRUNC is set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, CredPassNoSpaceMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(0)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // The control message should be truncated.
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+}
+
+// CredPassTruncatedMsgCtrunc passes a full set of credentials. It then receives
+// the data while providing enough space for only the first field of the
+// credentials and verifies that MSG_CTRUNC is set in the msghdr.
+TEST_P(UnixSocketPairCmsgTest, CredPassTruncatedMsgCtrunc) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(0) + sizeof(pid_t)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[sizeof(sent_data)] = {};
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ // The control message should be truncated.
+ EXPECT_EQ(msg.msg_flags, MSG_CTRUNC);
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+}
+
+TEST_P(UnixSocketPairCmsgTest, SoPassCred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ int opt;
+ socklen_t optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+
+ SetSoPassCred(sockets->first_fd());
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_TRUE(opt);
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+
+ int zero = 0;
+ EXPECT_THAT(setsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &zero,
+ sizeof(zero)),
+ SyscallSucceeds());
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->first_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+
+ optLen = sizeof(opt);
+ EXPECT_THAT(
+ getsockopt(sockets->second_fd(), SOL_SOCKET, SO_PASSCRED, &opt, &optLen),
+ SyscallSucceeds());
+ EXPECT_FALSE(opt);
+}
+
+TEST_P(UnixSocketPairCmsgTest, NoDataCredPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct msghdr msg = {};
+
+ struct iovec iov;
+ iov.iov_base = sent_data;
+ iov.iov_len = sizeof(sent_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ char control[CMSG_SPACE(0)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_CREDENTIALS;
+ cmsg->cmsg_len = CMSG_LEN(0);
+
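+  // An SCM_CREDENTIALS control message must carry a full struct ucred; a
+  // zero-length payload is rejected with EINVAL.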
+ ASSERT_THAT(RetryEINTR(sendmsg)(sockets->first_fd(), &msg, 0),
+ SyscallFailsWithErrno(EINVAL));
+}
+
+TEST_P(UnixSocketPairCmsgTest, NoPassCred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ ASSERT_NO_FATAL_FAILURE(
+ SendCreds(sockets->first_fd(), sent_creds, sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+
+ ASSERT_NO_FATAL_FAILURE(
+ RecvNoCmsg(sockets->second_fd(), received_data, sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+}
+
+TEST_P(UnixSocketPairCmsgTest, CredAndFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ struct ucred sent_creds;
+
+ ASSERT_THAT(sent_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(sent_creds.gid = getgid(), SyscallSucceeds());
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendCredsAndFD(sockets->first_fd(), sent_creds,
+ pair->second_fd(), sent_data,
+ sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
+ &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ EXPECT_EQ(sent_creds.pid, received_creds.pid);
+ EXPECT_EQ(sent_creds.uid, received_creds.uid);
+ EXPECT_EQ(sent_creds.gid, received_creds.gid);
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassBeforeSoPassCred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ SetSoPassCred(sockets->second_fd());
+
+ char received_data[20];
+ struct ucred received_creds;
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
+ &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds {
+ 0, 65534, 65534
+ };
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassAfterSoPassCred) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ SetSoPassCred(sockets->second_fd());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ struct ucred received_creds;
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvCredsAndFD(sockets->second_fd(), &received_creds,
+ &fd, received_data,
+ sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ struct ucred want_creds;
+ ASSERT_THAT(want_creds.pid = getpid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.uid = getuid(), SyscallSucceeds());
+ ASSERT_THAT(want_creds.gid = getgid(), SyscallSucceeds());
+
+ EXPECT_EQ(want_creds.pid, received_creds.pid);
+ EXPECT_EQ(want_creds.uid, received_creds.uid);
+ EXPECT_EQ(want_creds.gid, received_creds.gid);
+
+ ASSERT_NO_FATAL_FAILURE(TransferTest(fd, pair->first_fd()));
+}
+
+TEST_P(UnixSocketPairCmsgTest, CloexecDroppedWhenFDPassed) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair = ASSERT_NO_ERRNO_AND_VALUE(
+ UnixDomainSocketPair(SOCK_SEQPACKET | SOCK_CLOEXEC).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ char received_data[20];
+ int fd = -1;
+ ASSERT_NO_FATAL_FAILURE(RecvSingleFD(sockets->second_fd(), &fd, received_data,
+ sizeof(received_data)));
+
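+  // FD_CLOEXEC is a property of the descriptor, not the file, so it is not
+  // carried across SCM_RIGHTS; the received descriptor has the flag clear.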
+ EXPECT_THAT(fcntl(fd, F_GETFD), SyscallSucceedsWithValue(0));
+}
+
+TEST_P(UnixSocketPairCmsgTest, CloexecRecvFDPass) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(int))];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct iovec iov;
+ char received_data[20];
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
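+  // MSG_CMSG_CLOEXEC asks recvmsg to set FD_CLOEXEC on any descriptors it
+  // installs from SCM_RIGHTS control messages.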
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CMSG_CLOEXEC),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+ int fd = -1;
+ memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
+
+ EXPECT_THAT(fcntl(fd, F_GETFD), SyscallSucceedsWithValue(FD_CLOEXEC));
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassAfterSoPassCredWithoutCredSpace) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ SetSoPassCred(sockets->second_fd());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
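+  // Provide room for only a bare cmsg header. With SO_PASSCRED enabled, the
+  // SCM_CREDENTIALS message still arrives, truncated to an empty payload.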
+ char control[CMSG_LEN(0)];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[20];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+
+ EXPECT_EQ(msg.msg_controllen, sizeof(control));
+
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ EXPECT_EQ(cmsg->cmsg_len, sizeof(control));
+ EXPECT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ EXPECT_EQ(cmsg->cmsg_type, SCM_CREDENTIALS);
+}
+
+// This test validates that passing MSG_CTRUNC as an input flag to recvmsg
+// does not cause it to appear as an output flag in msg_flags when no
+// truncation occurs.
+TEST_P(UnixSocketPairCmsgTest, MsgCtruncInputIsNoop) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
+ char control[CMSG_SPACE(sizeof(int)) /* we're passing a single fd */];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ struct iovec iov;
+ char received_data[20];
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, MSG_CTRUNC),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+ struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
+ ASSERT_NE(cmsg, nullptr);
+ ASSERT_EQ(cmsg->cmsg_len, CMSG_LEN(sizeof(int)));
+ ASSERT_EQ(cmsg->cmsg_level, SOL_SOCKET);
+ ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS);
+
+  // Verify that MSG_CTRUNC was not set as an output flag.
+ EXPECT_EQ(msg.msg_flags & MSG_CTRUNC, 0);
+}
+
+TEST_P(UnixSocketPairCmsgTest, FDPassAfterSoPassCredWithoutCredHeaderSpace) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ char sent_data[20];
+ RandomizeBuffer(sent_data, sizeof(sent_data));
+
+ auto pair =
+ ASSERT_NO_ERRNO_AND_VALUE(UnixDomainSocketPair(SOCK_SEQPACKET).Create());
+
+ SetSoPassCred(sockets->second_fd());
+
+ ASSERT_NO_FATAL_FAILURE(SendSingleFD(sockets->first_fd(), pair->second_fd(),
+ sent_data, sizeof(sent_data)));
+
+ struct msghdr msg = {};
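+  // The control buffer cannot hold even a cmsg header, so no control message
+  // is returned and msg_controllen comes back as zero.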
+ char control[CMSG_LEN(0) / 2];
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ char received_data[20];
+ struct iovec iov;
+ iov.iov_base = received_data;
+ iov.iov_len = sizeof(received_data);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ ASSERT_THAT(RetryEINTR(recvmsg)(sockets->second_fd(), &msg, 0),
+ SyscallSucceedsWithValue(sizeof(received_data)));
+
+ EXPECT_EQ(0, memcmp(sent_data, received_data, sizeof(sent_data)));
+ EXPECT_EQ(msg.msg_controllen, 0);
+}
+
+} // namespace
+
+} // namespace testing
+} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_abstract.cc b/test/syscalls/linux/socket_unix_cmsg.h
index 8241bf997..431606903 100644
--- a/test/syscalls/linux/socket_unix_abstract.cc
+++ b/test/syscalls/linux/socket_unix_cmsg.h
@@ -12,26 +12,19 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <vector>
+#ifndef GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_CMSG_H_
+#define GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_CMSG_H_
#include "test/syscalls/linux/socket_test_util.h"
-#include "test/syscalls/linux/socket_unix.h"
-#include "test/syscalls/linux/unix_domain_socket_test_util.h"
-#include "test/util/test_util.h"
namespace gvisor {
namespace testing {
-std::vector<SocketPairKind> GetSocketPairs() {
- return ApplyVec<SocketPairKind>(
- AbstractBoundUnixDomainSocketPair,
- AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
- List<int>{0, SOCK_NONBLOCK}));
-}
-
-INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, UnixSocketPairTest,
- ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+// Test fixture for control-message tests that apply to pairs of connected
+// unix sockets.
+using UnixSocketPairCmsgTest = SocketPairTest;
} // namespace testing
} // namespace gvisor
+
+#endif // GVISOR_TEST_SYSCALLS_LINUX_SOCKET_UNIX_CMSG_H_
diff --git a/test/syscalls/linux/socket_unix_dgram_local.cc b/test/syscalls/linux/socket_unix_dgram_local.cc
index 4ba2c80ae..8c5a473bd 100644
--- a/test/syscalls/linux/socket_unix_dgram_local.cc
+++ b/test/syscalls/linux/socket_unix_dgram_local.cc
@@ -41,15 +41,15 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, DgramUnixSocketPairTest,
+ DgramUnixSockets, DgramUnixSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, UnixNonStreamSocketPairTest,
+ DgramUnixSockets, UnixNonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonStreamSocketPairTest,
+ DgramUnixSockets, NonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
index 9fe86cee8..707052af8 100644
--- a/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
+++ b/test/syscalls/linux/socket_unix_dgram_non_blocking.cc
@@ -44,7 +44,7 @@ TEST_P(NonBlockingDgramUnixSocketPairTest, ReadOneSideClosed) {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingDgramUnixSocketPairTest,
+ NonBlockingDgramUnixSockets, NonBlockingDgramUnixSocketPairTest,
::testing::ValuesIn(IncludeReversals(std::vector<SocketPairKind>{
UnixDomainSocketPair(SOCK_DGRAM | SOCK_NONBLOCK),
FilesystemBoundUnixDomainSocketPair(SOCK_DGRAM | SOCK_NONBLOCK),
diff --git a/test/syscalls/linux/socket_unix_filesystem.cc b/test/syscalls/linux/socket_unix_filesystem.cc
deleted file mode 100644
index 5dbe67773..000000000
--- a/test/syscalls/linux/socket_unix_filesystem.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2018 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <vector>
-
-#include "test/syscalls/linux/socket_test_util.h"
-#include "test/syscalls/linux/socket_unix.h"
-#include "test/syscalls/linux/unix_domain_socket_test_util.h"
-#include "test/util/test_util.h"
-
-namespace gvisor {
-namespace testing {
-
-std::vector<SocketPairKind> GetSocketPairs() {
- return ApplyVec<SocketPairKind>(
- FilesystemBoundUnixDomainSocketPair,
- AllBitwiseCombinations(List<int>{SOCK_STREAM, SOCK_DGRAM, SOCK_SEQPACKET},
- List<int>{0, SOCK_NONBLOCK}));
-}
-
-INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, UnixSocketPairTest,
- ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
-
-} // namespace testing
-} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
index 137db53c4..8ba7af971 100644
--- a/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_filesystem_nonblock.cc
@@ -30,7 +30,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingSocketPairTest,
+ NonBlockingFilesystemUnixSockets, NonBlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
index 98cf1fe8a..da762cd83 100644
--- a/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_non_stream_blocking_local.cc
@@ -34,7 +34,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, BlockingNonStreamSocketPairTest,
+ BlockingNonStreamUnixSockets, BlockingNonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_pair.cc b/test/syscalls/linux/socket_unix_pair.cc
index bacfc11e4..411fb4518 100644
--- a/test/syscalls/linux/socket_unix_pair.cc
+++ b/test/syscalls/linux/socket_unix_pair.cc
@@ -16,6 +16,7 @@
#include "test/syscalls/linux/socket_test_util.h"
#include "test/syscalls/linux/socket_unix.h"
+#include "test/syscalls/linux/socket_unix_cmsg.h"
#include "test/syscalls/linux/unix_domain_socket_test_util.h"
#include "test/util/test_util.h"
@@ -33,5 +34,9 @@ INSTANTIATE_TEST_SUITE_P(
AllUnixDomainSockets, UnixSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+INSTANTIATE_TEST_SUITE_P(
+ AllUnixDomainSockets, UnixSocketPairCmsgTest,
+ ::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
+
} // namespace testing
} // namespace gvisor
diff --git a/test/syscalls/linux/socket_unix_pair_nonblock.cc b/test/syscalls/linux/socket_unix_pair_nonblock.cc
index 583506f08..3135d325f 100644
--- a/test/syscalls/linux/socket_unix_pair_nonblock.cc
+++ b/test/syscalls/linux/socket_unix_pair_nonblock.cc
@@ -30,7 +30,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingSocketPairTest,
+ NonBlockingUnixSockets, NonBlockingSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_seqpacket_local.cc b/test/syscalls/linux/socket_unix_seqpacket_local.cc
index b903a9e8f..dff75a532 100644
--- a/test/syscalls/linux/socket_unix_seqpacket_local.cc
+++ b/test/syscalls/linux/socket_unix_seqpacket_local.cc
@@ -41,15 +41,15 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonStreamSocketPairTest,
+ SeqpacketUnixSockets, NonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, SeqpacketUnixSocketPairTest,
+ SeqpacketUnixSockets, SeqpacketUnixSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, UnixNonStreamSocketPairTest,
+ SeqpacketUnixSockets, UnixNonStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_stream_blocking_local.cc b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
index ce0f1e50d..fa0a9d367 100644
--- a/test/syscalls/linux/socket_unix_stream_blocking_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_blocking_local.cc
@@ -32,7 +32,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, BlockingStreamSocketPairTest,
+ BlockingStreamUnixSockets, BlockingStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_stream_local.cc b/test/syscalls/linux/socket_unix_stream_local.cc
index 6b840189c..65eef1a81 100644
--- a/test/syscalls/linux/socket_unix_stream_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_local.cc
@@ -39,7 +39,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, StreamSocketPairTest,
+ StreamUnixSockets, StreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
index ebec4e0ec..ec777c59f 100644
--- a/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
+++ b/test/syscalls/linux/socket_unix_stream_nonblock_local.cc
@@ -31,7 +31,7 @@ std::vector<SocketPairKind> GetSocketPairs() {
}
INSTANTIATE_TEST_SUITE_P(
- AllUnixDomainSockets, NonBlockingStreamSocketPairTest,
+ NonBlockingStreamUnixSockets, NonBlockingStreamSocketPairTest,
::testing::ValuesIn(IncludeReversals(GetSocketPairs())));
} // namespace testing
diff --git a/test/syscalls/linux/socket_unix_unbound_dgram.cc b/test/syscalls/linux/socket_unix_unbound_dgram.cc
index 2ddc5c11f..52aef891f 100644
--- a/test/syscalls/linux/socket_unix_unbound_dgram.cc
+++ b/test/syscalls/linux/socket_unix_unbound_dgram.cc
@@ -13,7 +13,9 @@
// limitations under the License.
#include <stdio.h>
+#include <sys/socket.h>
#include <sys/un.h>
+
#include "gtest/gtest.h"
#include "gtest/gtest.h"
#include "test/syscalls/linux/socket_test_util.h"
@@ -142,6 +144,28 @@ TEST_P(UnboundDgramUnixSocketPairTest, SendtoWithoutConnect) {
SyscallSucceedsWithValue(sizeof(data)));
}
+TEST_P(UnboundDgramUnixSocketPairTest, SendtoWithoutConnectPassCreds) {
+ auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair());
+
+ ASSERT_THAT(bind(sockets->first_fd(), sockets->first_addr(),
+ sockets->first_addr_size()),
+ SyscallSucceeds());
+
+ SetSoPassCred(sockets->first_fd());
+ char data = 'a';
+ ASSERT_THAT(
+ RetryEINTR(sendto)(sockets->second_fd(), &data, sizeof(data), 0,
+ sockets->first_addr(), sockets->first_addr_size()),
+ SyscallSucceedsWithValue(sizeof(data)));
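+  // The receiving socket has SO_PASSCRED enabled, so the sender's credentials
+  // should arrive in an SCM_CREDENTIALS control message.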
+ ucred creds;
+ creds.pid = -1;
+ char buf[sizeof(data) + 1];
+ ASSERT_NO_FATAL_FAILURE(
+ RecvCreds(sockets->first_fd(), &creds, buf, sizeof(buf), sizeof(data)));
+ EXPECT_EQ(0, memcmp(&data, buf, sizeof(data)));
+ EXPECT_THAT(getpid(), SyscallSucceedsWithValue(creds.pid));
+}
+
INSTANTIATE_TEST_SUITE_P(
AllUnixDomainSockets, UnboundDgramUnixSocketPairTest,
::testing::ValuesIn(VecCat<SocketPairKind>(
diff --git a/test/syscalls/linux/tcp_socket.cc b/test/syscalls/linux/tcp_socket.cc
index e3f9f9f9d..e95b644ac 100644
--- a/test/syscalls/linux/tcp_socket.cc
+++ b/test/syscalls/linux/tcp_socket.cc
@@ -751,6 +751,133 @@ TEST_P(SimpleTcpSocketTest, NonBlockingConnectRefused) {
EXPECT_THAT(close(s.release()), SyscallSucceeds());
}
+// Test that setting a supported congestion control algorithm succeeds for an
+// unconnected TCP socket.
+TEST_P(SimpleTcpSocketTest, SetCongestionControlSucceedsForSupported) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ {
+ const char kSetCC[kTcpCaNameMax] = "reno";
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &kSetCC,
+ strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax];
+ memset(got_cc, '1', sizeof(got_cc));
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+    // We ignore optlen here because the Linux kernel sets it to the smaller of
+    // the passed-in buffer size and kTcpCaNameMax, not to the length of the
+    // congestion control algorithm's actual name.
+    EXPECT_EQ(0, memcmp(got_cc, kSetCC, kTcpCaNameMax));
+ }
+ {
+ const char kSetCC[kTcpCaNameMax] = "cubic";
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &kSetCC,
+ strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax];
+ memset(got_cc, '1', sizeof(got_cc));
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+    // We ignore optlen here because the Linux kernel sets it to the smaller of
+    // the passed-in buffer size and kTcpCaNameMax, not to the length of the
+    // congestion control algorithm's actual name.
+    EXPECT_EQ(0, memcmp(got_cc, kSetCC, kTcpCaNameMax));
+ }
+}
+
+// This test verifies that getsockopt(...TCP_CONGESTION) behaviour is
+// consistent between Linux and gVisor when the passed-in buffer is smaller
+// than kTcpCaNameMax.
+TEST_P(SimpleTcpSocketTest, SetGetTCPCongestionShortReadBuffer) {
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ {
+ // Verify that getsockopt/setsockopt work with buffers smaller than
+ // kTcpCaNameMax.
+ const char kSetCC[] = "cubic";
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &kSetCC,
+ strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[sizeof(kSetCC)];
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+ EXPECT_EQ(sizeof(got_cc), optlen);
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(got_cc)));
+ }
+}
+
+// This test verifies that getsockopt(...TCP_CONGESTION) behaviour is
+// consistent between Linux and gVisor when the passed-in buffer is larger
+// than kTcpCaNameMax.
+TEST_P(SimpleTcpSocketTest, SetGetTCPCongestionLargeReadBuffer) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
+ {
+ // Verify that getsockopt works with buffers larger than
+ // kTcpCaNameMax.
+ const char kSetCC[] = "cubic";
+ ASSERT_THAT(setsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &kSetCC,
+ strlen(kSetCC)),
+ SyscallSucceedsWithValue(0));
+
+ char got_cc[kTcpCaNameMax + 5];
+ socklen_t optlen = sizeof(got_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+    // Linux copies at most the smaller of kTcpCaNameMax and the length of the
+    // passed-in buffer, and sets optlen to the number of bytes actually copied,
+    // irrespective of the actual length of the congestion control name.
+ EXPECT_EQ(kTcpCaNameMax, optlen);
+ EXPECT_EQ(0, memcmp(got_cc, kSetCC, sizeof(kSetCC)));
+ }
+}
+
+// Test that setting an unsupported congestion control algorithm fails for an
+// unconnected TCP socket.
+TEST_P(SimpleTcpSocketTest, SetCongestionControlFailsForUnsupported) {
+ // This is Linux's net/tcp.h TCP_CA_NAME_MAX.
+ const int kTcpCaNameMax = 16;
+
+ FileDescriptor s =
+ ASSERT_NO_ERRNO_AND_VALUE(Socket(GetParam(), SOCK_STREAM, IPPROTO_TCP));
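+  // Capture the current congestion control algorithm so we can verify that it
+  // is left unchanged after the failed setsockopt.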
+ char old_cc[kTcpCaNameMax];
+ socklen_t optlen = sizeof(old_cc);
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &old_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+
+ const char kSetCC[] = "invalid_ca_kSetCC";
+ ASSERT_THAT(
+ setsockopt(s.get(), SOL_TCP, TCP_CONGESTION, &kSetCC, strlen(kSetCC)),
+ SyscallFailsWithErrno(ENOENT));
+
+ char got_cc[kTcpCaNameMax];
+ ASSERT_THAT(
+ getsockopt(s.get(), IPPROTO_TCP, TCP_CONGESTION, &got_cc, &optlen),
+ SyscallSucceedsWithValue(0));
+  // We ignore optlen here because the Linux kernel sets it to the smaller of
+  // the passed-in buffer size and kTcpCaNameMax, not to the length of the
+  // congestion control algorithm's actual name.
+  EXPECT_EQ(0, memcmp(got_cc, old_cc, kTcpCaNameMax));
+}
+
INSTANTIATE_TEST_SUITE_P(AllInetTests, SimpleTcpSocketTest,
::testing::Values(AF_INET, AF_INET6));
diff --git a/test/syscalls/syscall_test_runner.go b/test/syscalls/syscall_test_runner.go
index 9a8e0600b..476248184 100644
--- a/test/syscalls/syscall_test_runner.go
+++ b/test/syscalls/syscall_test_runner.go
@@ -47,6 +47,7 @@ var (
platform = flag.String("platform", "ptrace", "platform to run on")
useTmpfs = flag.Bool("use-tmpfs", false, "mounts tmpfs for /tmp")
fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode")
+ overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay")
parallel = flag.Bool("parallel", false, "run tests in parallel")
runscPath = flag.String("runsc", "", "path to runsc binary")
)
@@ -184,10 +185,13 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
"-platform", *platform,
"-root", rootDir,
"-file-access", *fileAccess,
- "--network=none",
+ "-network=none",
"-log-format=text",
"-TESTONLY-unsafe-nonroot=true",
- "--net-raw=true",
+ "-net-raw=true",
+ }
+ if *overlay {
+ args = append(args, "-overlay")
}
if *debug {
args = append(args, "-debug", "-log-packets=true")
@@ -196,7 +200,11 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) {
args = append(args, "-strace")
}
if outDir, ok := syscall.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); ok {
- debugLogDir, err := ioutil.TempDir(outDir, "runsc")
+ tdir := filepath.Join(outDir, strings.Replace(tc.FullName(), "/", "_", -1))
+ if err := os.MkdirAll(tdir, 0755); err != nil {
+ t.Fatalf("could not create test dir: %v", err)
+ }
+ debugLogDir, err := ioutil.TempDir(tdir, "runsc")
if err != nil {
t.Fatalf("could not create temp dir: %v", err)
}
diff --git a/tools/go_branch.sh b/tools/go_branch.sh
new file mode 100755
index 000000000..8ea6a6d8d
--- /dev/null
+++ b/tools/go_branch.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -eo pipefail
+
+# Discover the package name from the go.mod file.
+declare -r gomod="$(pwd)/go.mod"
+declare -r module=$(cat "${gomod}" | grep -E "^module" | cut -d' ' -f2)
+
+# Check that gopath has been built.
+declare -r gopath_dir="$(pwd)/bazel-bin/gopath/src/${module}"
+if ! [ -d "${gopath_dir}" ]; then
+ echo "No gopath directory found; build the :gopath target." >&2
+ exit 1
+fi
+
+# Create a temporary working directory, and ensure that this directory and all
+# subdirectories are cleaned up upon exit.
+declare -r tmp_dir=$(mktemp -d)
+finish() {
+ cd # Leave tmp_dir.
+ rm -rf "${tmp_dir}"
+}
+trap finish EXIT
+
+# Record the current working commit.
+declare -r head=$(git describe --always)
+
+# We expect to have an existing go branch that we will use as the basis for
+# this commit. That branch may be empty, but it must exist.
+declare -r go_branch=$(git show-ref --hash origin/go)
+
+# Clone the current repository into the temporary directory and check out the
+# existing go branch there. We move to the new repository for convenience.
+declare -r repo_orig="$(pwd)"
+declare -r repo_new="${tmp_dir}/repository"
+git clone . "${repo_new}"
+cd "${repo_new}"
+
+# Setup the repository and checkout the branch.
+git config user.email "gvisor-bot@google.com"
+git config user.name "gVisor bot"
+git fetch origin "${go_branch}"
+git checkout -b go "${go_branch}"
+
+# Start working on a merge commit that combines the previous history with the
+# current history. Note that we don't actually want any changes yet.
+git merge --allow-unrelated-histories --no-commit --strategy ours ${head}
+
+# Sync the entire gopath_dir and go.mod.
+rsync --recursive --verbose --delete --exclude .git --exclude README.md -L "${gopath_dir}/" .
+cp "${gomod}" .
+
+# There are a few solitary files that can get left behind due to the way bazel
+# constructs the gopath target. Note that we don't find all Go files here
+# because they may correspond to unused templates, etc.
+cp "${repo_orig}"/runsc/*.go runsc/
+
+# Update the current working set and commit.
+git add . && git commit -m "Merge ${head} (automated)"
+
+# Push the branch back to the original repository.
+git remote add orig "${repo_orig}" && git push -f orig go:go
diff --git a/tools/run_build.sh b/tools/run_build.sh
new file mode 100755
index 000000000..d49a1d4be
--- /dev/null
+++ b/tools/run_build.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Fail on any error.
+set -e
+# Display commands to stderr.
+set -x
+
+# Install the latest version of Bazel and log the version.
+(which use_bazel.sh && use_bazel.sh latest) || which bazel
+bazel version
+
+# Switch into the workspace.
+if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then
+ cd git/repo
+elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
+ cd github/repo
+fi
+
+# Build runsc.
+bazel build //runsc
+
+# Copy the runsc binary into the "latest" directory and into a directory named
+# with the current date.
+if [[ -v KOKORO_ARTIFACTS_DIR ]]; then
+ latest_dir="${KOKORO_ARTIFACTS_DIR}"/latest
+ today_dir="${KOKORO_ARTIFACTS_DIR}"/"$(date -Idate)"
+ mkdir -p "${latest_dir}" "${today_dir}"
+ cp bazel-bin/runsc/linux_amd64_pure_stripped/runsc "${latest_dir}"
+ sha512sum "${latest_dir}"/runsc | awk '{print $1 " runsc"}' > "${latest_dir}"/runsc.sha512
+ cp bazel-bin/runsc/linux_amd64_pure_stripped/runsc "${today_dir}"
+ sha512sum "${today_dir}"/runsc | awk '{print $1 " runsc"}' > "${today_dir}"/runsc.sha512
+fi
diff --git a/tools/run_tests.sh b/tools/run_tests.sh
new file mode 100755
index 000000000..7a1f889dd
--- /dev/null
+++ b/tools/run_tests.sh
@@ -0,0 +1,288 @@
+#!/bin/bash
+
+# Copyright 2018 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Fail on any error. Treat unset variables as error. Print commands as executed.
+set -eux
+
+###################
+# GLOBAL ENV VARS #
+###################
+
+if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then
+ readonly WORKSPACE_DIR="${PWD}/git/repo"
+elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then
+ readonly WORKSPACE_DIR="${PWD}/github/repo"
+else
+ readonly WORKSPACE_DIR="${PWD}"
+fi
+
+# Used to configure RBE.
+readonly CLOUD_PROJECT_ID="gvisor-rbe"
+readonly RBE_PROJECT_ID="projects/${CLOUD_PROJECT_ID}/instances/default_instance"
+
+# Random runtime name to avoid collisions.
+readonly RUNTIME="runsc_test_$((RANDOM))"
+
+# Packages that will be built and tested.
+readonly BUILD_PACKAGES=("//...")
+readonly TEST_PACKAGES=("//pkg/..." "//runsc/..." "//tools/...")
+
+#######################
+# BAZEL CONFIGURATION #
+#######################
+
+# Install the latest version of Bazel and log the version.
+(which use_bazel.sh && use_bazel.sh latest) || which bazel
+bazel version
+
+# Load the kvm module.
+sudo -n -E modprobe kvm
+
+# General Bazel build/test flags.
+BAZEL_BUILD_FLAGS=(
+ "--show_timestamps"
+ "--test_output=errors"
+ "--keep_going"
+ "--verbose_failures=true"
+)
+
+# Bazel build/test flags for RBE, a superset of BAZEL_BUILD_FLAGS.
+BAZEL_BUILD_RBE_FLAGS=(
+ "${BAZEL_BUILD_FLAGS[@]}"
+ "--config=remote"
+ "--project_id=${CLOUD_PROJECT_ID}"
+ "--remote_instance_name=${RBE_PROJECT_ID}"
+)
+if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then
+ BAZEL_BUILD_RBE_FLAGS=(
+ "${BAZEL_BUILD_RBE_FLAGS[@]}"
+ "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}"
+ )
+fi
+
+####################
+# Helper Functions #
+####################
+
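+# sanity_checks verifies that WORKSPACE is in sync with go.mod: running
+# gazelle update-repos must not produce any diff.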
+sanity_checks() {
+ cd ${WORKSPACE_DIR}
+ bazel run //:gazelle -- update-repos -from_file=go.mod
+ git diff --exit-code WORKSPACE
+}
+
+build_everything() {
+ FLAVOR="${1}"
+
+ cd ${WORKSPACE_DIR}
+ bazel build \
+ -c "${FLAVOR}" "${BAZEL_BUILD_RBE_FLAGS[@]}" \
+ "${BUILD_PACKAGES[@]}"
+}
+
+# run_simple_tests runs the tests that require no special setup or
+# configuration.
+run_simple_tests() {
+ cd ${WORKSPACE_DIR}
+ bazel test \
+ "${BAZEL_BUILD_FLAGS[@]}" \
+ "${TEST_PACKAGES[@]}"
+}
+
+install_runtime() {
+ cd ${WORKSPACE_DIR}
+ sudo -n ${WORKSPACE_DIR}/runsc/test/install.sh --runtime ${RUNTIME}
+}
+
+install_helper() {
+ PACKAGE="${1}"
+ TAG="${2}"
+ GOPATH="${3}"
+
+ # Clone the repository.
+ mkdir -p "${GOPATH}"/src/$(dirname "${PACKAGE}") && \
+ git clone https://"${PACKAGE}" "${GOPATH}"/src/"${PACKAGE}"
+
+ # Checkout and build the repository.
+ (cd "${GOPATH}"/src/"${PACKAGE}" && \
+ git checkout "${TAG}" && \
+ GOPATH="${GOPATH}" make && \
+ sudo -n -E env GOPATH="${GOPATH}" make install)
+}
+
+# Install dependencies for the crictl tests.
+install_crictl_test_deps() {
+ sudo -n -E apt-get update
+ sudo -n -E apt-get install -y btrfs-tools libseccomp-dev
+
+ # Install containerd & cri-tools.
+ GOPATH=$(mktemp -d --tmpdir gopathXXXXX)
+ install_helper github.com/containerd/containerd v1.2.2 "${GOPATH}"
+ install_helper github.com/kubernetes-sigs/cri-tools v1.11.0 "${GOPATH}"
+
+ # Install gvisor-containerd-shim.
+ local latest=/tmp/gvisor-containerd-shim-latest
+ local shim_path=/tmp/gvisor-containerd-shim
+ wget --no-verbose https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/latest -O ${latest}
+ wget --no-verbose https://storage.googleapis.com/cri-containerd-staging/gvisor-containerd-shim/gvisor-containerd-shim-$(cat ${latest}) -O ${shim_path}
+ chmod +x ${shim_path}
+ sudo -n -E mv ${shim_path} /usr/local/bin
+
+ # Configure containerd-shim.
+ local shim_config_path=/etc/containerd
+ local shim_config_tmp_path=/tmp/gvisor-containerd-shim.toml
+ sudo -n -E mkdir -p ${shim_config_path}
+ cat > ${shim_config_tmp_path} <<-EOF
+ runc_shim = "/usr/local/bin/containerd-shim"
+
+ [runsc_config]
+ debug = "true"
+ debug-log = "/tmp/runsc-logs/"
+ strace = "true"
+ file-access = "shared"
+EOF
+ sudo mv ${shim_config_tmp_path} ${shim_config_path}
+
+ # Configure CNI.
+ (cd "${GOPATH}" && sudo -n -E env PATH="${PATH}" GOPATH="${GOPATH}" \
+ src/github.com/containerd/containerd/script/setup/install-cni)
+}
+
+# Run the tests that require docker.
+run_docker_tests() {
+ cd ${WORKSPACE_DIR}
+
+ # Run tests with a default runtime (runc).
+ bazel test \
+ "${BAZEL_BUILD_FLAGS[@]}" \
+ --test_env=RUNSC_RUNTIME="" \
+ --test_output=all \
+ //runsc/test/image:image_test
+
+  # These names are used to exclude tests not supported in certain
+  # configurations, e.g. save/restore is not supported with hostnet.
+ declare -a variations=("" "-kvm" "-hostnet" "-overlay")
+ for v in "${variations[@]}"; do
+ # Change test names otherwise each run of tests will overwrite logs and
+ # results of the previous run.
+ sed -i "s/name = \"integration_test.*\"/name = \"integration_test${v}\"/" runsc/test/integration/BUILD
+ sed -i "s/name = \"image_test.*\"/name = \"image_test${v}\"/" runsc/test/image/BUILD
+ # Run runsc tests with docker that are tagged manual.
+ bazel test \
+ "${BAZEL_BUILD_FLAGS[@]}" \
+ --test_env=RUNSC_RUNTIME="${RUNTIME}${v}" \
+ --test_output=all \
+ //runsc/test/image:image_test${v} \
+ //runsc/test/integration:integration_test${v}
+ done
+}
+
+# Run the tests that require root.
+run_root_tests() {
+ cd ${WORKSPACE_DIR}
+ bazel build //runsc/test/root:root_test
+ local root_test=$(find -L ./bazel-bin/ -executable -type f -name root_test | grep __main__)
+ if [[ ! -f "${root_test}" ]]; then
+ echo "root_test executable not found"
+ exit 1
+ fi
+ sudo -n -E RUNSC_RUNTIME="${RUNTIME}" RUNSC_EXEC=/tmp/"${RUNTIME}"/runsc ${root_test}
+}
+
+# Run syscall unit tests.
+run_syscall_tests() {
+ cd ${WORKSPACE_DIR}
+ bazel test "${BAZEL_BUILD_RBE_FLAGS[@]}" \
+ --test_tag_filters=runsc_ptrace //test/syscalls/...
+}
+
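+# Exercise "runsc do", both rootless and with root privileges.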
+run_runsc_do_tests() {
+ local runsc=$(find bazel-bin/runsc -type f -executable -name "runsc" | head -n1)
+
+ # run runsc do without root privileges.
+ ${runsc} --rootless do true
+ ${runsc} --rootless --network=none do true
+
+ # run runsc do with root privileges.
+ sudo -n -E ${runsc} do true
+}
+
+# Find and rename all test xml and log files so that Sponge can pick them up.
+# XML files must be named sponge_log.xml, and log files must be named
+# sponge_log.log. We move all such files into KOKORO_ARTIFACTS_DIR, in a
+# subdirectory named with the test name.
+upload_test_artifacts() {
+ # Skip if no kokoro directory.
+ [[ -v KOKORO_ARTIFACTS_DIR ]] || return
+
+ cd ${WORKSPACE_DIR}
+ find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" |
+ tar --create --files-from - --transform 's/test\./sponge_log./' |
+ tar --extract --directory ${KOKORO_ARTIFACTS_DIR}
+ if [[ -d "/tmp/${RUNTIME}/logs" ]]; then
+ tar --create --gzip "--file=${KOKORO_ARTIFACTS_DIR}/runsc-logs.tar.gz" -C /tmp/ ${RUNTIME}/logs
+ fi
+}
+
+# Finish runs at exit, even in the event of an error, and uploads all test
+# artifacts.
+finish() {
+ # Grab the last exit code, we will return it.
+ local exit_code=${?}
+ upload_test_artifacts
+ exit ${exit_code}
+}
+
+# Run bazel in a Docker container.
+build_in_docker() {
+ cd ${WORKSPACE_DIR}
+ bazel clean
+ bazel shutdown
+ make
+ make runsc
+ make bazel-shutdown
+}
+
+########
+# MAIN #
+########
+
+main() {
+ # Register finish to run at exit.
+ trap finish EXIT
+
+ # Build and run the simple tests.
+ sanity_checks
+ build_everything opt
+ run_simple_tests
+
+ # So far so good. Install more deps and run the integration tests.
+ install_runtime
+ install_crictl_test_deps
+ run_docker_tests
+ run_root_tests
+
+ run_syscall_tests
+ run_runsc_do_tests
+
+ # Build other flavors too.
+ build_everything dbg
+
+ build_in_docker
+ # No need to call "finish" here, it will happen at exit.
+}
+
+# Kick it off.
+main